From 29ca3b60ce9dc63eba6c610f12c3bbf835f0963d Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Mon, 18 Mar 2024 11:57:23 -0700 Subject: [PATCH] helpers: linux: add module kallsyms helpers Add Python helpers which load module kallsyms and return a symbol index for them. Unlike the /proc/kallsyms and built-in kallsyms, these are quite easy to handle using regular Python & drgn code, so implement them as Python helpers. There are (at least) two use cases for these helpers: 1. After loading CTF and built-in vmlinux kallsyms, support for module kallsyms is still necessary. 2. Sometimes, people only extract vmlinux DWARF debuginfo. Adding module symbols can allow stack traces and other symbolization to work even without module debuginfo. Signed-off-by: Stephen Brennan --- drgn/helpers/linux/kallsyms.py | 166 +++++++++++++++++++- tests/linux_kernel/helpers/test_kallsyms.py | 20 ++- 2 files changed, 182 insertions(+), 4 deletions(-) diff --git a/drgn/helpers/linux/kallsyms.py b/drgn/helpers/linux/kallsyms.py index bd2c0887e..448cbe239 100644 --- a/drgn/helpers/linux/kallsyms.py +++ b/drgn/helpers/linux/kallsyms.py @@ -13,16 +13,26 @@ """ import os import re -from typing import Dict +from typing import Dict, List, Tuple from _drgn import ( _linux_helper_load_builtin_kallsyms, _linux_helper_load_proc_kallsyms as _load_proc_kallsyms, ) -from drgn import Program, ProgramFlags, SymbolIndex +from drgn import ( + Object, + Program, + ProgramFlags, + Symbol, + SymbolBinding, + SymbolIndex, + SymbolKind, +) +from drgn.helpers.linux.module import for_each_module __all__ = ( "load_vmlinux_kallsyms", + "load_module_kallsyms", ) @@ -73,3 +83,155 @@ def load_vmlinux_kallsyms(prog: Program) -> SymbolIndex: return _load_proc_kallsyms() else: return _load_builtin_kallsyms(prog) + + +def _nm_type_to_binding_kind(code: str) -> Tuple[SymbolBinding, SymbolKind]: + binding = SymbolBinding.UNKNOWN + kind = SymbolKind.UNKNOWN + if code == "v": + binding = SymbolBinding.WEAK + kind = 
SymbolKind.OBJECT + elif code == "w": + binding = SymbolBinding.WEAK + elif code in "tT": + kind = SymbolKind.FUNC + elif code.lower() in "srbgncd": + kind = SymbolKind.OBJECT + if binding == SymbolBinding.UNKNOWN and code.isupper(): + binding = SymbolBinding.GLOBAL + return binding, kind + + +def _st_info_to_binding_kind(info: int) -> Tuple[SymbolBinding, SymbolKind]: + binding_int = info >> 4 + STB_WEAK = 2 + STB_GNU_UNIQUE = 10 + if binding_int <= STB_WEAK or binding_int == STB_GNU_UNIQUE: + binding = SymbolBinding(binding_int + 1) + else: + binding = SymbolBinding.UNKNOWN + type_ = info & 0xF + STT_TLS = 6 + STT_GNU_IFUNC = 10 + if type_ <= STT_TLS or type_ == STT_GNU_IFUNC: + kind = SymbolKind(type_) + else: + kind = SymbolKind.UNKNOWN + return binding, kind + + +def _elf_sym_to_symbol(name: str, obj: Object, has_typetab: bool) -> Symbol: + # Linux likes to have the nm(1) character code for its symbols, which it + # refers to as the symbol's "type" (this is of course distinct from the ELF + # notion of a symbol type, let alone what drgn considers a "type"...). + # + # Prior to 5439c985c5a8 ("module: Overwrite st_size instead of st_info"), + # merged in v5.0, the kernel simply overwrote the "st_info" field with a + # single-character code that represents the nm(1) character code for that + # symbol. However, starting with that commit, it was switched to overwrite + # the "st_size" field instead! This was thankfully fixed in v5.2 with + # 1c7651f43777 ("kallsyms: store type information in its own array"). + # + # Unfortunately, this leaves us with three possibilities: + # 1. Pre-v5.0: interpret the "st_info" as a character from nm(1) and try to + # infer the kind and bindings. + # 2. 5.0-5.1: interpret the "st_info" as normal, but ignore the "st_size" + # field since it is bogus. + # 3. 5.2+: both fields are valid, and the nm(1) code is stored in "typetab". + # + # Case 3 can be determined easily by the presence of "typetab" in "struct + # mod_kallsyms".
However, cases 1 & 2 are indistinguishable. For our + # purposes, it makes more sense to fall back to case 1. After all, neither + # 5.0 nor 5.1 was an LTS kernel, and neither is actively used by any major + # distro. We have no way to deal with 5.0 or 5.1, whereas we can make some + # informed guesses for pre-5.0 based on the nm(1) code. + if has_typetab: + binding, kind = _st_info_to_binding_kind(obj.st_info.value_()) + else: + binding, kind = _nm_type_to_binding_kind(chr(obj.st_info.value_())) + return Symbol( # type: ignore + name, + obj.st_value.value_(), + obj.st_size.value_(), + binding, + kind, + ) + + +def _module_kallsyms(module: Object) -> List[Symbol]: + """ + Return a list of symbols for a kernel module + + When compiled with ``CONFIG_KALLSYMS``, the kernel maintains ELF symbol + information about each module within ``struct module``. This function + accesses this symbol information, and returns a list of drgn :class:`Symbol` + objects for the module. Keep in mind that unless ``CONFIG_KALLSYMS_ALL`` is + enabled, these symbols are typically only function symbols. + + :param module: :class:`Object` of type ``struct module *`` + :returns: a list of symbols + """ + try: + ks = module.kallsyms + except AttributeError: + # Prior to 8244062ef1e54 ("modules: fix longstanding /proc/kallsyms vs + # module insertion race."), the kallsyms variables were stored directly + # on the module object. This commit was introduced in 4.5, but was + # backported to some stable kernels too. Fall back to the module object + # in cases where kallsyms field isn't available. + ks = module + + prog = module.prog_ + num_symtab = ks.num_symtab.value_() + try: + ks.member_("typetab") + has_typetab = True + except LookupError: + has_typetab = False + + # The symtab field is a pointer, but it points at an array of Elf_Sym + # objects. Indexing it requires drgn to do pointer arithmetic and issue a + # lot of very small /proc/kcore reads, which can be a real performance + # issue.
So convert it into an object representing a correctly-sized array, + # and then read that object all at once. This does one /proc/kcore read, + # which is a major improvement! + symtab = Object( + prog, + type=prog.array_type(ks.symtab.type_.type, num_symtab), + address=ks.symtab.value_(), + ).read_() + + # The strtab is similarly a pointer into a contiguous array of strings packed + # next to each other. Reading individual strings from /proc/kcore can be + # quite slow. So read the entire array of bytes into a Python bytes value, + # and we'll extract the individual symbol strings from there. + last_string_start = symtab[num_symtab - 1].st_name.value_() + last_string_len = len(ks.strtab[last_string_start].address_of_().string_()) + 1 + strtab = prog.read(ks.strtab.value_(), last_string_start + last_string_len) + syms = [] + for i in range(ks.num_symtab.value_()): + elfsym = symtab[i] + if not elfsym.st_name: + continue + str_index = elfsym.st_name.value_() + nul_byte = strtab.find(b"\x00", str_index) + name = strtab[str_index:nul_byte].decode("ascii") + syms.append(_elf_sym_to_symbol(name, elfsym, has_typetab)) + return syms + + +def load_module_kallsyms(prog: Program) -> SymbolIndex: + """ + Return a symbol index containing all module symbols from kallsyms + + For kernels built with ``CONFIG_KALLSYMS``, loaded kernel modules contain + an ELF symbol table in kernel memory. This function can parse those data + structures and create a symbol index usable by drgn. However, it requires + that you already have debuginfo for the vmlinux image.
+ + :returns: a symbol index containing all symbols from module kallsyms + """ + all_symbols = [] + for module in for_each_module(prog): + all_symbols.extend(_module_kallsyms(module)) + return SymbolIndex(all_symbols) diff --git a/tests/linux_kernel/helpers/test_kallsyms.py b/tests/linux_kernel/helpers/test_kallsyms.py index 4533de2c4..104f6fae5 100644 --- a/tests/linux_kernel/helpers/test_kallsyms.py +++ b/tests/linux_kernel/helpers/test_kallsyms.py @@ -5,8 +5,12 @@ from unittest import TestCase from drgn import Symbol, SymbolBinding, SymbolKind -from drgn.helpers.linux.kallsyms import _load_builtin_kallsyms, _load_proc_kallsyms -from tests.linux_kernel import LinuxKernelTestCase +from drgn.helpers.linux.kallsyms import ( + _load_builtin_kallsyms, + _load_proc_kallsyms, + load_module_kallsyms, +) +from tests.linux_kernel import LinuxKernelTestCase, skip_unless_have_test_kmod def compare_local_symbols(self, finder, modules=False): @@ -93,3 +97,15 @@ def test_builtin_kallsyms(self): self.skipTest("VMCOREINFO is missing necessary symbols") finder = _load_builtin_kallsyms(self.prog) compare_local_symbols(self, finder) + + @skip_unless_have_test_kmod + def test_module_kallsyms(self): + finder = load_module_kallsyms(self.prog) + test_data = finder(None, "drgn_test_empty_list", None, True)[0] + self.assertEqual("drgn_test_empty_list", test_data.name) + self.assertEqual(SymbolKind.OBJECT, test_data.kind) + self.assertIn(test_data.binding, (SymbolBinding.GLOBAL, SymbolBinding.UNKNOWN)) + size = self.prog.type("struct list_head").size + self.assertEqual(size, test_data.size) + address = self.prog.object("drgn_test_empty_list").address_ + self.assertEqual(address, test_data.address)