From cc1c101ba37cce71f17797f91824b7afd5e47bed Mon Sep 17 00:00:00 2001 From: "Gabriele N. Tornetta" Date: Wed, 1 May 2024 14:31:41 +0100 Subject: [PATCH] chore: exclude non-user symbols from symbol DB (#9013) We prevent non-user symbols from being collected by the symbol database to reduce the work done by the client library, as well as the size of the uploaded payloads. ## Checklist - [x] Change(s) are motivated and described in the PR description - [x] Testing strategy is described if automated tests are not included in the PR - [x] Risks are described (performance impact, potential for breakage, maintainability) - [x] Change is maintainable (easy to change, telemetry, documentation) - [x] [Library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) are followed or label `changelog/no-changelog` is set - [x] Documentation is included (in-code, generated user docs, [public corp docs](https://github.com/DataDog/documentation/)) - [x] Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) - [x] If this PR changes the public interface, I've notified `@DataDog/apm-tees`. ## Reviewer Checklist - [ ] Title is accurate - [ ] All changes are related to the pull request's stated goal - [ ] Description motivates each change - [ ] Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - [ ] Testing strategy adequately addresses listed risks - [ ] Change is maintainable (easy to change, telemetry, documentation) - [ ] Release note makes sense to a user of the library - [ ] Author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - [ ] Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/internal/packages.py | 5 ++++ ddtrace/internal/symbol_db/symbols.py | 34 ++++++++++++++---------- tests/internal/symbol_db/test_symbols.py | 24 ++++++++--------- 3 files changed, 36 insertions(+), 27 deletions(-) diff --git a/ddtrace/internal/packages.py b/ddtrace/internal/packages.py index 2d8f1c5fd1e..fcec01a463b 100644 --- a/ddtrace/internal/packages.py +++ b/ddtrace/internal/packages.py @@ -238,6 +238,11 @@ def is_third_party(path: Path) -> bool: return package.name in _third_party_packages() +@cached() +def is_user_code(path: Path) -> bool: + return not (is_stdlib(path) or is_third_party(path)) + + @cached() def is_distribution_available(name: str) -> bool: """Determine if a distribution is available in the current environment.""" diff --git a/ddtrace/internal/symbol_db/symbols.py b/ddtrace/internal/symbol_db/symbols.py index d454e9eb8f5..9f66ffa3a86 100644 --- a/ddtrace/internal/symbol_db/symbols.py +++ b/ddtrace/internal/symbol_db/symbols.py @@ -3,7 +3,7 @@ from dataclasses import field import dis from enum import Enum -import http +from http.client import HTTPResponse from inspect import CO_VARARGS from inspect import CO_VARKEYWORDS from inspect import isasyncgenfunction @@ -31,7 +31,6 @@ from ddtrace.internal.logger import get_logger from ddtrace.internal.module import BaseModuleWatchdog from ddtrace.internal.module import origin -from ddtrace.internal.packages import is_stdlib from ddtrace.internal.runtime import get_runtime_id from ddtrace.internal.safety import _isinstance from ddtrace.internal.utils.cache import cached @@ -50,10 +49,10 @@ @cached() -def is_from_stdlib(obj: t.Any) -> t.Optional[bool]: +def is_from_user_code(obj: t.Any) -> t.Optional[bool]: try: path = origin(sys.modules[object.__getattribute__(obj, "__module__")]) - return is_stdlib(path) if path is not None else None + return packages.is_user_code(path) if path is not None else None except (AttributeError, KeyError): return None @@ -182,9 +181,6 @@ def _(cls, module: ModuleType, data: ScopeData): symbols = [] scopes = [] - if is_stdlib(module_origin): - return None - for alias, child in object.__getattribute__(module, "__dict__").items(): if _isinstance(child, ModuleType): # We don't want to traverse other modules. @@ -224,7 +220,7 @@ def _(cls, obj: type, data: ScopeData): return None data.seen.add(obj) - if is_from_stdlib(obj): + if not is_from_user_code(obj): return None symbols = [] @@ -347,7 +343,7 @@ def _(cls, f: FunctionType, data: ScopeData): return None data.seen.add(f) - if is_from_stdlib(f): + if not is_from_user_code(f): return None code = f.__dd_wrapped__.__code__ if hasattr(f, "__dd_wrapped__") else f.__code__ @@ -416,7 +412,7 @@ def _(cls, pr: property, data: ScopeData): data.seen.add(pr.fget) # TODO: These names don't match what is reported by the discovery. - if pr.fget is None or is_from_stdlib(pr.fget): + if pr.fget is None or not is_from_user_code(pr.fget): return None path = func_origin(t.cast(FunctionType, pr.fget)) @@ -477,7 +473,7 @@ def to_json(self) -> dict: "scopes": [_.to_json() for _ in self._scopes], } - def upload(self) -> http.client.HTTPResponse: + def upload(self) -> HTTPResponse: body, headers = multipart( parts=[ FormData( @@ -509,14 +505,24 @@ def __len__(self) -> int: def is_module_included(module: ModuleType) -> bool: + # Check if module name matches the include patterns if symdb_config._includes_re.match(module.__name__): return True - package = packages.module_to_package(module) - if package is None: + # Check if it is user code + module_origin = origin(module) + if module_origin is None: return False - return symdb_config._includes_re.match(package.name) is not None + if packages.is_user_code(module_origin): + return True + + # Check if the package name matches the include patterns + package = packages.filename_to_package(module_origin) + if package is not None and symdb_config._includes_re.match(package.name): + return True + + return False class SymbolDatabaseUploader(BaseModuleWatchdog): diff --git a/tests/internal/symbol_db/test_symbols.py b/tests/internal/symbol_db/test_symbols.py index 4c879b63e5c..a97f6c5bcee 100644 --- a/tests/internal/symbol_db/test_symbols.py +++ b/tests/internal/symbol_db/test_symbols.py @@ -203,20 +203,11 @@ def test_symbols_upload_enabled(): assert remoteconfig_poller.get_registered("LIVE_DEBUGGING_SYMBOL_DB") is not None -@pytest.mark.subprocess( - ddtrace_run=True, - env=dict( - DD_SYMBOL_DATABASE_UPLOAD_ENABLED="1", - _DD_SYMBOL_DATABASE_FORCE_UPLOAD="1", - DD_SYMBOL_DATABASE_INCLUDES="tests.submod.stuff", - ), -) +@pytest.mark.subprocess(ddtrace_run=True, env=dict(DD_SYMBOL_DATABASE_INCLUDES="tests.submod.stuff")) def test_symbols_force_upload(): from ddtrace.internal.symbol_db.symbols import ScopeType from ddtrace.internal.symbol_db.symbols import SymbolDatabaseUploader - assert SymbolDatabaseUploader.is_installed() - contexts = [] def _upload_context(context): @@ -224,11 +215,18 @@ def _upload_context(context): SymbolDatabaseUploader._upload_context = staticmethod(_upload_context) + SymbolDatabaseUploader.install() + + def get_scope(contexts, name): + for context in (_.to_json() for _ in contexts): + for scope in context["scopes"]: + if scope["name"] == name: + return scope + raise ValueError(f"Scope {name} not found in {contexts}") + import tests.submod.stuff # noqa import tests.submod.traced_stuff # noqa - (context,) = contexts - - (scope,) = context.to_json()["scopes"] + scope = get_scope(contexts, "tests.submod.stuff") assert scope["scope_type"] == ScopeType.MODULE assert scope["name"] == "tests.submod.stuff"