From fda9b2566d422e698d8f9f6ab45b6d10672e7634 Mon Sep 17 00:00:00 2001 From: Stephen Brennan Date: Sun, 17 Mar 2024 09:55:05 -0700 Subject: [PATCH] libdrgn, python: add SymbolIndex The Symbol Finder API gives us the ability to register a dynamic callback for symbol lookup. However, many common use cases are satisfied by a simple static list of symbols. Correct and efficient lookup in this simple case is rather tricky. Implement a new type, SymbolIndex, which can take a list of symbols and index them for efficient lookup by name or address. Signed-off-by: Stephen Brennan --- _drgn.pyi | 63 ++++++++ docs/api_reference.rst | 1 + drgn/__init__.py | 2 + libdrgn/Makefile.am | 1 + libdrgn/python/drgnpy.h | 7 + libdrgn/python/main.c | 1 + libdrgn/python/program.c | 10 ++ libdrgn/python/symbol_index.c | 122 +++++++++++++++ libdrgn/symbol.c | 269 ++++++++++++++++++++++++++++++++++ libdrgn/symbol.h | 102 +++++++++++++ tests/test_symbol.py | 120 ++++++++++++++- 11 files changed, 697 insertions(+), 1 deletion(-) create mode 100644 libdrgn/python/symbol_index.c diff --git a/_drgn.pyi b/_drgn.pyi index a8452bb7e..4856c5b2b 100644 --- a/_drgn.pyi +++ b/_drgn.pyi @@ -1898,6 +1898,69 @@ class Symbol: kind: Final[SymbolKind] """Kind of entity represented by this symbol.""" +class SymbolIndex: + """ + A ``SymbolIndex`` contains a static set of symbols and allows efficient + lookup by name and address. + + With :meth:`Program.register_symbol_finder()`, you can add a callback to + provide custom symbol finding logic. However, in many cases, all that is + necessary is to provide drgn with a list of symbols that you know to be part + of the program. This object allows you to do that. It efficiently implements + the Symbol Finder API given a static set of symbols. 
For example:: + + >>> prog = drgn.Program() + >>> symbol = drgn.Symbol("foo", 0x123, 1, drgn.SymbolBinding.GLOBAL, drgn.SymbolKind.OBJECT) + >>> finder = drgn.SymbolIndex([symbol]) + >>> prog.register_symbol_finder("SymbolIndex", finder, enable_index=0) + >>> prog.symbols() + [Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>)] + >>> prog.symbol("bar") + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + LookupError: not found + >>> prog.symbol("foo") + Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>) + >>> prog.symbol(0x100) + Traceback (most recent call last): + File "<stdin>", line 1, in <module> + LookupError: not found + >>> prog.symbol(0x123) + Symbol(name='foo', address=0x123, size=0x1, binding=<SymbolBinding.GLOBAL: 2>, kind=<SymbolKind.OBJECT: 1>) + """ + + def __init__(self, symbols: Iterable[Symbol]) -> None: + """ + Create a ``SymbolIndex`` from a sequence of symbols. + + The returned symbol index satisfies the Symbol Finder API. It supports + overlapping symbol address ranges and duplicate symbol names. However, + in the case of these sorts of conflicts, it doesn't provide any + guarantee on the order of the results, or which result is returned when + a single symbol is requested. + + :param symbols: An iterable of symbols + :returns: A callable object suitable to provide to + :meth:`Program.register_symbol_finder()`. + """ + + def __call__( + self, + prog: Program, + name: Optional[str], + address: Optional[int], + one: bool, + ) -> List[Symbol]: + """ + Look up symbol by name, address, or both. 
+ + :param prog: (unused) the program looking up this symbol + :param name: if given, only return symbols with this name + :param address: if given, only return symbols spanning this address + :param one: if true, limit the result to a single symbol + :returns: a list of matching symbols (empty if none are found) + """ + class SymbolBinding(enum.Enum): """ A ``SymbolBinding`` describes the linkage behavior and visibility of a diff --git a/docs/api_reference.rst b/docs/api_reference.rst index 2cf3789c7..b3c4d7b22 100644 --- a/docs/api_reference.rst +++ b/docs/api_reference.rst @@ -109,6 +109,7 @@ Symbols .. drgndoc:: Symbol .. drgndoc:: SymbolBinding .. drgndoc:: SymbolKind +.. drgndoc:: SymbolIndex Stack Traces ------------ diff --git a/drgn/__init__.py b/drgn/__init__.py index d83c40a8e..5a03f5a30 100644 --- a/drgn/__init__.py +++ b/drgn/__init__.py @@ -70,6 +70,7 @@ StackTrace, Symbol, SymbolBinding, + SymbolIndex, SymbolKind, Thread, Type, @@ -127,6 +128,7 @@ "StackTrace", "Symbol", "SymbolBinding", + "SymbolIndex", "SymbolKind", "Thread", "Type", diff --git a/libdrgn/Makefile.am b/libdrgn/Makefile.am index 0ce6639e3..8cc46bbbe 100644 --- a/libdrgn/Makefile.am +++ b/libdrgn/Makefile.am @@ -171,6 +171,7 @@ _drgn_la_SOURCES = python/constants.c \ python/program.c \ python/stack_trace.c \ python/symbol.c \ + python/symbol_index.c \ python/test.c \ python/thread.c \ python/type.c \ diff --git a/libdrgn/python/drgnpy.h b/libdrgn/python/drgnpy.h index af2d7c4fe..8c2dcd06f 100644 --- a/libdrgn/python/drgnpy.h +++ b/libdrgn/python/drgnpy.h @@ -18,6 +18,7 @@ #include "../hash_table.h" #include "../pp.h" #include "../program.h" +#include "../symbol.h" /* These were added in Python 3.7. 
*/ #ifndef Py_UNREACHABLE @@ -108,6 +109,11 @@ typedef struct { PyObject *attr_cache; } DrgnType; +typedef struct { + PyObject_HEAD + struct drgn_symbol_index index; +} SymbolIndex; + typedef struct { PyObject_HEAD /* @@ -242,6 +248,7 @@ extern PyTypeObject Register_type; extern PyTypeObject StackFrame_type; extern PyTypeObject StackTrace_type; extern PyTypeObject Symbol_type; +extern PyTypeObject SymbolIndex_type; extern PyTypeObject Thread_type; extern PyTypeObject ThreadIterator_type; extern PyTypeObject TypeEnumerator_type; diff --git a/libdrgn/python/main.c b/libdrgn/python/main.c index f5b164cd5..cd9e93874 100644 --- a/libdrgn/python/main.c +++ b/libdrgn/python/main.c @@ -297,6 +297,7 @@ DRGNPY_PUBLIC PyMODINIT_FUNC PyInit__drgn(void) add_type(m, &StackFrame_type) || add_type(m, &StackTrace_type) || add_type(m, &Symbol_type) || + add_type(m, &SymbolIndex_type) || add_type(m, &DrgnType_type) || add_type(m, &Thread_type) || add_type(m, &ThreadIterator_type) || diff --git a/libdrgn/python/program.c b/libdrgn/python/program.c index 56804f7c4..600991ac8 100644 --- a/libdrgn/python/program.c +++ b/libdrgn/python/program.c @@ -504,6 +504,16 @@ py_symbol_find_fn(const char *name, uint64_t addr, enum drgn_find_symbol_flags flags, void *arg, struct drgn_symbol_result_builder *builder) { + // Fast path for SymbolIndex: don't bother converting to and from Python + // types, as this is a C finder. Use Py_TYPE and pointer comparison + // directly here to avoid needing to take the GIL for + // PyObject_TypeCheck(). SymbolIndex cannot be subclassed, so the logic + // for subclass checking is unnecessary anyway. 
+ if (Py_TYPE(PyTuple_GET_ITEM(arg, 1)) == &SymbolIndex_type) { + SymbolIndex *ix = (SymbolIndex *)PyTuple_GET_ITEM(arg, 1); + return drgn_symbol_index_find(name, addr, flags, &ix->index, builder); + } + PyGILState_guard(); _cleanup_pydecref_ PyObject *name_obj = NULL; diff --git a/libdrgn/python/symbol_index.c b/libdrgn/python/symbol_index.c new file mode 100644 index 000000000..d19467352 --- /dev/null +++ b/libdrgn/python/symbol_index.c @@ -0,0 +1,122 @@ +// Copyright (c) 2024 Oracle and/or its affiliates +// SPDX-License-Identifier: LGPL-2.1-or-later + +#include "drgnpy.h" +#include "../symbol.h" + +static void SymbolIndex_dealloc(SymbolIndex *self) +{ + drgn_symbol_index_deinit(&self->index); + Py_TYPE(self)->tp_free((PyObject *)self); +} + +static PyObject *SymbolIndex_call(SymbolIndex *self, PyObject *args, PyObject *kwargs) +{ + PyObject *prog_obj; + struct index_arg address = { .allow_none = true }; + const char *name; + static char *kwnames[] = {"prog", "name", "address", "one", NULL}; + int single; // 'p' format specifier expects an int, not bool + + if (!PyArg_ParseTupleAndKeywords(args, kwargs, "OzO&p:__call__", kwnames, + &prog_obj, &name, index_converter, &address, + &single)) + return NULL; + + unsigned int flags = 0; + if (single) + flags |= DRGN_FIND_SYMBOL_ONE; + if (!address.is_none) + flags |= DRGN_FIND_SYMBOL_ADDR; + if (name) + flags |= DRGN_FIND_SYMBOL_NAME; + + struct drgn_symbol_result_builder builder; + drgn_symbol_result_builder_init(&builder, flags & DRGN_FIND_SYMBOL_ONE); + + struct drgn_error *err = + drgn_symbol_index_find(name, address.uvalue, flags, &self->index, &builder); + if (err) + goto error; + + /* We return a list regardless */ + if (single) { + struct drgn_symbol *symbol = drgn_symbol_result_builder_single(&builder); + _cleanup_pydecref_ PyObject *list = PyList_New(symbol ? 
1 : 0); + if (!list) + goto error; + if (symbol) { + PyObject *pysym = Symbol_wrap(symbol, (PyObject *)self); + if (!pysym) + goto error; + PyList_SET_ITEM(list, 0, pysym); + } + return_ptr(list); + } else { + struct drgn_symbol **syms; + size_t count; + drgn_symbol_result_builder_array(&builder, &syms, &count); + return Symbol_list_wrap(syms, count, (PyObject *)self); + } + + return NULL; +error: + drgn_symbol_result_builder_abort(&builder); + return err ? set_drgn_error(err) : NULL; +} + +static PyObject *SymbolIndex_new(PyTypeObject *subtype, PyObject *args, PyObject *kwds) +{ + static char *kwnames[] = {"symbols", NULL}; + PyObject *list_obj; + + if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", kwnames, &list_obj)) + return NULL; + + _cleanup_pydecref_ PyObject *iter = + PyObject_GetIter(list_obj); + if (!iter) + return NULL; + + _cleanup_(drgn_symbol_index_builder_deinit) + struct drgn_symbol_index_builder builder; + drgn_symbol_index_builder_init(&builder); + + for (;;) { + _cleanup_pydecref_ PyObject *item = PyIter_Next(iter); + if (!item) + break; + if (!PyObject_TypeCheck(item, &Symbol_type)) + return PyErr_Format(PyExc_TypeError, "expected sequence of Symbols"); + Symbol *sym = (Symbol *)item; + if (!drgn_symbol_index_builder_add(&builder, sym->sym)) + return PyErr_NoMemory(); + } + + if (PyErr_Occurred()) + return NULL; + + _cleanup_pydecref_ SymbolIndex *index_obj = call_tp_alloc(SymbolIndex); + if (!index_obj) + return NULL; + + struct drgn_error *err = + drgn_symbol_index_init_from_builder(&index_obj->index, + &builder); + // On error, the builder and index are already deinitialized + if (err) + return set_drgn_error(err); + + return (PyObject *)no_cleanup_ptr(index_obj); +} + +PyTypeObject SymbolIndex_type = { + PyVarObject_HEAD_INIT(NULL, 0) + .tp_name = "_drgn.SymbolIndex", + .tp_basicsize = sizeof(SymbolIndex), + .tp_dealloc = (destructor)SymbolIndex_dealloc, + .tp_flags = Py_TPFLAGS_DEFAULT, + .tp_doc = drgn_SymbolIndex_DOC, + .tp_call = 
(ternaryfunc)SymbolIndex_call, + .tp_new = SymbolIndex_new, +}; diff --git a/libdrgn/symbol.c b/libdrgn/symbol.c index 02ae0e7fd..786d8e0b5 100644 --- a/libdrgn/symbol.c +++ b/libdrgn/symbol.c @@ -2,13 +2,18 @@ // SPDX-License-Identifier: LGPL-2.1-or-later #include +#include #include #include +#include "binary_search.h" #include "drgn_internal.h" +#include "string_builder.h" #include "symbol.h" #include "util.h" +DEFINE_VECTOR_FUNCTIONS(symbol_vector); + LIBDRGN_PUBLIC void drgn_symbol_destroy(struct drgn_symbol *sym) { if (sym && sym->name_lifetime == DRGN_LIFETIME_OWNED) @@ -174,3 +179,267 @@ void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder symbolp_vector_shrink_to_fit(&builder->vector); symbolp_vector_steal(&builder->vector, syms_ret, count_ret); } + +static int name_compar(const void *lhs, const void *rhs, void *arg) +{ + struct drgn_symbol_index *ix = arg; + uint32_t left_ix = *(const uint32_t *)lhs; + uint32_t right_ix = *(const uint32_t *)rhs; + return strcmp(ix->symbols[left_ix].name, ix->symbols[right_ix].name); +} + +static int addr_compar(const void *lhs, const void *rhs) +{ + const struct drgn_symbol *left = lhs; + const struct drgn_symbol *right = rhs; + // returning a simple subtraction would not work well since these are + // unsigned + if (left->address < right->address) + return -1; + else if (left->address > right->address) + return 1; + else + return 0; +} + +struct drgn_error * +drgn_symbol_index_init(struct drgn_symbol *symbols, uint32_t count, + char *buffer, struct drgn_symbol_index *ret) +{ + ret->symbols = symbols; + ret->num_syms = count; + ret->strings = buffer; + ret->name_sort = NULL; + ret->max_addrs = NULL; + drgn_symbol_name_table_init(&ret->htab); + ret->name_sort = malloc_array(count, sizeof(ret->name_sort[0])); + if (!ret->name_sort) + goto enomem; + ret->max_addrs = malloc_array(count, sizeof(ret->max_addrs[0])); + if (!ret->max_addrs) + goto enomem; + + // In many cases (e.g kallsyms), symbols are 
already sorted by address, + // but not always. Check whether sorted, and if not, sort. + for (uint32_t i = 1; i < ret->num_syms; i++) { + if (ret->symbols[i - 1].address > ret->symbols[i].address) { + qsort(ret->symbols, count, sizeof(ret->symbols[0]), addr_compar); + break; + } + } + + // Kallsyms doesn't include symbol lengths, so symbols are + // non-overlapping. But this is not true in general! Symbols may + // overlap, which makes address lookup complicated. Rather than using a + // complex range data structure, we can use two binary searches, one to + // find the first symbol which could overlap with an address, and one to + // find the last symbol, and then linearly search that array. This + // performs poorly if there are symbols which span many others, but + // that's a rare case. In order to do this strategy, we need an array + // that contains the maximum address spanned by any symbol at or before + // that index. + if (ret->num_syms > 0) // in case num_syms == 0 + ret->max_addrs[0] = ret->symbols[0].address + ret->symbols[0].size; + for (uint32_t i = 1; i < ret->num_syms; i++) { + uint64_t max_addr = ret->symbols[i].address + ret->symbols[i].size; + ret->max_addrs[i] = max(ret->max_addrs[i - 1], max_addr); + } + + // Sort the "name_sort" array by name so we get runs of symbols with the + // same name + for (uint32_t i = 0; i < ret->num_syms; i++) + ret->name_sort[i] = i; + qsort_arg(ret->name_sort, ret->num_syms, sizeof(ret->name_sort[0]), + name_compar, ret); + + // For each unique symbol name, insert the range of symbol indexes + // into the hash table for fast name lookup + struct drgn_symbol_name_table_entry entry; + uint32_t current = 0; + while (current < ret->num_syms) { + const char *current_str = ret->symbols[ret->name_sort[current]].name; + uint32_t next = current + 1; + while (next < ret->num_syms) { + const char *next_str = ret->symbols[ret->name_sort[next]].name; + if (strcmp(current_str, next_str) != 0) + break; + next++; + } + + 
entry.key = current_str; + entry.value.start = current; + entry.value.end = next; + if (drgn_symbol_name_table_insert(&ret->htab, &entry, NULL) < 0) + goto enomem; + + current = next; + } + return NULL; + +enomem: + drgn_symbol_index_deinit(ret); + return &drgn_enomem; +} + +void +drgn_symbol_index_deinit(struct drgn_symbol_index *index) +{ + // The symbol array is contiguous and all names come from strings + free(index->symbols); + free(index->max_addrs); + drgn_symbol_name_table_deinit(&index->htab); + free(index->strings); + free(index->name_sort); + // Simplify error handling by ensuring deinit is safe to call twice + memset(index, 0, sizeof(*index)); +} + +static void address_search_range(struct drgn_symbol_index *index, uint64_t address, + uint32_t *start_ret, uint32_t *end_ret) +{ + // First, identify the maximum symbol index which could possibly contain + // this address. Think of this as: + // end_ret = bisect_right([s.address for s in symbols], address) + #define less_than_start(a, b) (*(a) < (b)->address) + *end_ret = binary_search_gt(index->symbols, index->num_syms, &address, + less_than_start); + #undef less_than_start + + // Second, identify first symbol index which could possibly contain this + // address. 
We need to use "max_addrs" for this task: + // bisect_right(max_addrs, address) + #define less_than_end(a, b) (*(a) < *(b)) + *start_ret = binary_search_gt(index->max_addrs, index->num_syms, &address, + less_than_end); + #undef less_than_end +} + +/** Allocate a copy of the symbol and add it to the builder */ +static bool add_symbol_result(struct drgn_symbol_result_builder *builder, + struct drgn_symbol *symbol) +{ + struct drgn_symbol *copy = malloc(sizeof(*copy)); + if (!copy) + return false; + *copy = *symbol; + if (!drgn_symbol_result_builder_add(builder, copy)) { + free(copy); + return false; + } + return true; +} + +struct drgn_error * +drgn_symbol_index_find(const char *name, uint64_t address, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder) +{ + struct drgn_symbol_index *index = arg; + + // Unlike the ELF symbol finder, we don't have any particular rules + // about which symbols get priority when looking up a single symbol. + // If we decide this logic is critical, it would probably make sense to + // move it into the symbol finder's API via the result builder, rather + // than reimplementing it here. 
+ + if (flags & DRGN_FIND_SYMBOL_ADDR) { + uint32_t start, end; + address_search_range(index, address, &start, &end); + for (uint32_t i = start; i < end; i++) { + struct drgn_symbol *s = &index->symbols[i]; + if (s->address > address || address >= s->address + s->size) + continue; + if ((flags & DRGN_FIND_SYMBOL_NAME) && + strcmp(s->name, name) != 0) + continue; + if (!add_symbol_result(builder, s)) + return &drgn_enomem; + if (flags & DRGN_FIND_SYMBOL_ONE) + break; + } + } else if (flags & DRGN_FIND_SYMBOL_NAME) { + struct drgn_symbol_name_table_iterator it = + drgn_symbol_name_table_search(&index->htab, &name); + if (!it.entry) + return NULL; + for (uint32_t i = it.entry->value.start; i < it.entry->value.end; i++) { + struct drgn_symbol *s = &index->symbols[index->name_sort[i]]; + if (!add_symbol_result(builder, s)) + return &drgn_enomem; + if (flags & DRGN_FIND_SYMBOL_ONE) + break; + } + } else { + for (uint32_t i = 0; i < index->num_syms; i++) { + struct drgn_symbol *s = &index->symbols[i]; + if (!add_symbol_result(builder, s)) + return &drgn_enomem; + if (flags & DRGN_FIND_SYMBOL_ONE) + break; + } + } + return NULL; +} + +void +drgn_symbol_index_builder_init(struct drgn_symbol_index_builder *builder) +{ + builder->names = (struct string_builder)STRING_BUILDER_INIT; + symbol_vector_init(&builder->symbols); +} + +void +drgn_symbol_index_builder_deinit(struct drgn_symbol_index_builder *builder) +{ + string_builder_deinit(&builder->names); + symbol_vector_deinit(&builder->symbols); +} + +bool +drgn_symbol_index_builder_add(struct drgn_symbol_index_builder *builder, + const struct drgn_symbol *ptr) +{ + struct drgn_symbol copy = *ptr; + + // Temporarily store the index into the name + copy.name = (char *)builder->names.len; + return string_builder_append(&builder->names, ptr->name) + && string_builder_appendc(&builder->names, '\0') + && symbol_vector_append(&builder->symbols, &copy); +} + +struct drgn_error * +drgn_symbol_index_init_from_builder(struct drgn_symbol_index 
*index, + struct drgn_symbol_index_builder *builder) +{ + size_t names_len = builder->names.len; + char *names = string_builder_steal(&builder->names); + char *tmp_names = realloc(names, names_len); + if (tmp_names) + names = tmp_names; + + symbol_vector_shrink_to_fit(&builder->symbols); + struct drgn_symbol *symbols; + size_t num_syms; + symbol_vector_steal(&builder->symbols, &symbols, &num_syms); + + // Now that the name array is finalized, resolve the names to real + // pointers. Update the name lifetime to static, reflecting that the + // symbol name is owned by the finder whose lifetime is bound to the + // program's once it is attached. + for (size_t i = 0; i < num_syms; i++) { + size_t string_index = (size_t)symbols[i].name; + symbols[i].name = &names[string_index]; + symbols[i].name_lifetime = DRGN_LIFETIME_STATIC; + } + + if (num_syms > UINT32_MAX) { + free(names); + free(symbols); + return drgn_error_format(DRGN_ERROR_OUT_OF_BOUNDS, + "too many symbols provided: %zu > %" PRIu32, + num_syms, UINT32_MAX); + } + + return drgn_symbol_index_init(symbols, num_syms, names, index); +} diff --git a/libdrgn/symbol.h b/libdrgn/symbol.h index 4a2caf1c5..410ebe2a3 100644 --- a/libdrgn/symbol.h +++ b/libdrgn/symbol.h @@ -1,4 +1,5 @@ // Copyright (c) Meta Platforms, Inc. and affiliates. +// Copyright (c) 2024, Oracle and/or its affiliates. 
// SPDX-License-Identifier: LGPL-2.1-or-later #ifndef DRGN_SYMBOL_H @@ -9,6 +10,8 @@ #include "cleanup.h" #include "drgn_internal.h" #include "handler.h" +#include "hash_table.h" +#include "string_builder.h" #include "vector.h" struct drgn_symbol { @@ -64,4 +67,103 @@ void drgn_symbol_result_builder_array(struct drgn_symbol_result_builder *builder struct drgn_error * drgn_symbol_copy(struct drgn_symbol *dst, struct drgn_symbol *src); +DEFINE_HASH_MAP(drgn_symbol_name_table, const char *, + struct { uint32_t start; uint32_t end; }, + c_string_key_hash_pair, c_string_key_eq); + +/** + * An index of symbols, supporting efficient lookup by name or address + * + * While the dynamic symbol finding callback is a very flexible API, many use + * cases can be served best by simply providing drgn with a known symbol table + * to index. Drgn can efficiently implement the name and address lookup + * functions once, and provide a symbol finder implementation, so that clients + * need not redo this boilerplate. + * + * In the interest of simplicity, the index is immutable once created. This + * allows us to use simple data structures. If the symbol table needs frequent + * updates, then registering a custom symbol finder should be preferred. + */ +struct drgn_symbol_index { + /** Array of symbols, in sorted order by address */ + struct drgn_symbol *symbols; + + /** Array of max_addr, to aid address lookup */ + uint64_t *max_addrs; + + /** Number of symbols */ + uint32_t num_syms; + + /** The buffer containing all symbol names */ + char *strings; + + /** Array of symbol indices, sorted by name. Used by the htab. */ + uint32_t *name_sort; + + /** Map of symbol names to index */ + struct drgn_symbol_name_table htab; +}; + +/** + * Create a symbol index from an array of symbols + * + * This takes ownership of the symbol array and the individual symbols. 
The @a + * buffer argument allows us to provide a single backing buffer for all strings + * (in which case the lifetime of each symbol name should be static). On error + * @a symbols and @a buffer are already freed, since the index took ownership + * of them. + */ +struct drgn_error * +drgn_symbol_index_init(struct drgn_symbol *symbols, uint32_t count, + char *buffer, struct drgn_symbol_index *ret); + +/** Deinitialize the symbol index. Safe to call multiple times. */ +void drgn_symbol_index_deinit(struct drgn_symbol_index *index); + +DEFINE_VECTOR_TYPE(symbol_vector, struct drgn_symbol); + +struct drgn_symbol_index_builder { + struct string_builder names; + struct symbol_vector symbols; +}; + +/** + * Create a symbol builder which will efficiently pack string names next + * to each other in memory, rather than allocating many small strings. + */ +void +drgn_symbol_index_builder_init(struct drgn_symbol_index_builder *builder); + +/** + * For destroying a builder on error conditions. It is safe to call this + * multiple times, including after drgn_symbol_index_init_from_builder(). + */ +void +drgn_symbol_index_builder_deinit(struct drgn_symbol_index_builder *builder); + +/** + * Add symbol to the builder: the builder does not take ownership of @a ptr, + * instead making a copy. + */ +bool +drgn_symbol_index_builder_add(struct drgn_symbol_index_builder *builder, + const struct drgn_symbol *ptr); + +/** + * Convert the builder to a symbol index, destroying the builder. + * On error, the builder and symbol index are both deinitialized, requiring no + * further cleanup. + */ +struct drgn_error * +drgn_symbol_index_init_from_builder(struct drgn_symbol_index *index, + struct drgn_symbol_index_builder *builder); + +/** + * The actual implementation of the Symbol Finder API. 
+ */ +struct drgn_error * +drgn_symbol_index_find(const char *name, uint64_t address, + enum drgn_find_symbol_flags flags, void *arg, + struct drgn_symbol_result_builder *builder); + #endif /* DRGN_SYMBOL_H */ diff --git a/tests/test_symbol.py b/tests/test_symbol.py index ee84c7e29..d9cc3dd94 100644 --- a/tests/test_symbol.py +++ b/tests/test_symbol.py @@ -3,7 +3,7 @@ import tempfile from _drgn_util.elf import ET, PT, SHT, STB, STT -from drgn import Program, Symbol, SymbolBinding, SymbolKind +from drgn import Program, Symbol, SymbolBinding, SymbolIndex, SymbolKind from tests import TestCase from tests.dwarfwriter import dwarf_sections from tests.elfwriter import ElfSection, ElfSymbol, create_elf_file @@ -343,3 +343,121 @@ def test_many_without_filter(self): self.expect_args(None, None, False) self.assertEqual(self.prog.symbols(), self.TEST_SYMS) self.assertTrue(self.called) + + +class TestSymbolIndex(TestCase): + # Symbols are listed here in order of address, but are shuffled below + AA = Symbol("AA", 10, 5, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + BB = Symbol("BB", 12, 1, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + CC = Symbol("CC", 13, 8, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + DD = Symbol("DD", 28, 5, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + EE = Symbol("EE", 34, 1, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + FF = Symbol("FF", 34, 10, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + GG = Symbol("GG", 34, 2, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + BB2 = Symbol("BB", 36, 3, SymbolBinding.GLOBAL, SymbolKind.OBJECT) + + TEST_SYMS = [GG, BB, AA, BB2, CC, FF, DD, EE] + + def setUp(self): + # This class tests both the SymbolIndex callable interface, and the + # Symbol Finder API. While this seems like it duplicates code, it's + # necessary to test both since they exercise different code paths: the + # Symbol Finder API uses a more efficient fast path. 
+ self.finder = SymbolIndex(self.TEST_SYMS) + self.prog = Program() + self.prog.register_symbol_finder("test", self.finder, enable_index=0) + + def test_name_single(self): + for sym in self.TEST_SYMS: + if sym.name != "BB": + self.assertEqual([sym], self.finder(self.prog, sym.name, None, True)) + self.assertEqual(sym, self.prog.symbol(sym.name)) + self.assertEqual([sym], self.finder(self.prog, sym.name, None, False)) + self.assertEqual([sym], self.prog.symbols(sym.name)) + + def test_name_multiple(self): + multi_result = self.finder(self.prog, "BB", None, False) + self.assertEqual(2, len(multi_result)) + self.assertIn(self.BB, multi_result) + self.assertIn(self.BB2, multi_result) + + multi_result = self.prog.symbols("BB") + self.assertEqual(2, len(multi_result)) + self.assertIn(self.BB, multi_result) + self.assertIn(self.BB2, multi_result) + + single_result = self.finder(self.prog, "BB", None, True) + self.assertIn(single_result[0], (self.BB, self.BB2)) + + single_result = self.prog.symbol("BB") + self.assertIn(single_result, (self.BB, self.BB2)) + + def test_addr(self): + cases = { + 9: [], + 10: [self.AA], + 12: [self.AA, self.BB], + 13: [self.AA, self.CC], + 15: [self.CC], + 25: [], + 28: [self.DD], + 30: [self.DD], + 34: [self.EE, self.FF, self.GG], + 35: [self.FF, self.GG], + 36: [self.FF, self.BB2], + 43: [self.FF], + 44: [], + } + for address, expected in cases.items(): + # first, lookup by address alone and ensure we get all correct + # candidates: + multi_result = self.finder(self.prog, None, address, False) + self.assertEqual(len(expected), len(multi_result)) + self.assertTrue(all(e in multi_result for e in expected)) + multi_result = self.prog.symbols(address) + self.assertEqual(len(expected), len(multi_result)) + self.assertTrue(all(e in multi_result for e in expected)) + + # next, ensure that the single lookup works as expected: + if expected: + single_result = self.finder(self.prog, None, address, True) + self.assertEqual(1, len(single_result)) + 
self.assertIn(single_result[0], expected) + single_result = self.prog.symbol(address) + self.assertIn(single_result, expected) + + # Now, test that adding a name filter correctly filters: + # This cannot be tested with the Program.symbol() API since only + # one filter is allowed there. + for sym in expected: + self.assertEqual([sym], self.finder(self.prog, sym.name, address, True)) + self.assertEqual( + [sym], self.finder(self.prog, sym.name, address, False) + ) + + self.assertEqual([], self.finder(None, "MISSING", address, True)) + self.assertEqual([], self.finder(None, "MISSING", address, False)) + + def test_all(self): + result = self.finder(self.prog, None, None, True) + self.assertEqual(1, len(result)) + self.assertIn(result[0], self.TEST_SYMS) + result = self.finder(self.prog, None, None, False) + self.assertEqual(len(self.TEST_SYMS), len(result)) + for sym in self.TEST_SYMS: + self.assertIn(sym, result) + result = self.prog.symbols() + self.assertEqual(len(self.TEST_SYMS), len(result)) + for sym in self.TEST_SYMS: + self.assertIn(sym, result) + + def test_empty_index(self): + index = SymbolIndex([]) + # Check all the possible query patterns to ensure they can safely handle + # an empty list. + self.assertEqual([], index(self.prog, "name search", None, True)) + self.assertEqual([], index(self.prog, "name search", None, False)) + self.assertEqual([], index(self.prog, None, 0xFFFF, True)) + self.assertEqual([], index(self.prog, None, 0xFFFF, False)) + self.assertEqual([], index(self.prog, "name search", 0xFFFF, True)) + self.assertEqual([], index(self.prog, "name search", 0xFFFF, False))