From 2300f140e5f97c4656f9f06d422710a3a0af9e5e Mon Sep 17 00:00:00 2001 From: wantsui Date: Wed, 4 Sep 2024 11:25:16 -0400 Subject: [PATCH 01/12] chore(docs): update the contributing testing instructions for specific tests (#10506) Replaces https://github.com/DataDog/dd-trace-py/pull/10500 . Thank you @taegyunkim for reviewing! In the current docs for running specific tests, it says: > Find the directive in the [CI config](https://github.com/DataDog/dd-trace-py/blob/32b88eadc00e05cd0bc2aec587f565cc89f71229/.circleci/config.yml#L664) whose pattern is equal to the suite name. Note the docker_services section of the directive, if present - these are the "suite services". This is referring to an older version of the repo, and assumes these lines are there: ``` gevent: <<: *contrib_job docker: - image: *ddtrace_dev_image - *testagent steps: - run_test: pattern: 'gevent' ``` However, if I change the branch to `main`, it no longer looks like this. Instead, it looks like this logic has been shifted to `config.templ.yml`: https://github.com/DataDog/dd-trace-py/blob/733a80eeb08c631967d3b17502cf0d6a9239c5cb/.circleci/config.templ.yml#L799 . The goal of this PR is to update the link. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- docs/contributing-testing.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/contributing-testing.rst b/docs/contributing-testing.rst index a6613ae4825..e485b3d2b2d 100644 --- a/docs/contributing-testing.rst +++ b/docs/contributing-testing.rst @@ -69,7 +69,7 @@ How do I run only the tests I care about? 2. Find the ``Venv`` in the `riotfile `_ whose ``command`` contains the tests you're interested in. Note the ``Venv``'s ``name`` - this is the "suite name". -3. Find the directive in the `CI config `_ +3. Find the directive in the file `./circleci/config.templ.yml `_ whose ``pattern`` is equal to the suite name. Note the ``docker_services`` section of the directive, if present - these are the "suite services". 4. Start the suite services, if applicable, with ``$ docker-compose up -d service1 service2``. From 3c813d6f0c7a014be833efd14215923cf8ed7422 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 4 Sep 2024 11:40:03 -0400 Subject: [PATCH 02/12] chore(profiling): test when WRAPT_DISABLED_EXTENSIONS is set (#10496) The venvs setting `WRAPT_DISABLED_EXTENSIONS` were never used in ci pipelines. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .riot/requirements/1b559c7.txt | 28 -------- .riot/requirements/1c31b2e.txt | 25 ------- .riot/requirements/1e81c99.txt | 27 ------- .riot/requirements/1ef847d.txt | 27 ------- .riot/requirements/1fa5bd6.txt | 23 ------ .riot/requirements/d6b8465.txt | 23 ------ riotfile.py | 7 -- tests/profiling/collector/test_threading.py | 78 ++++++++++++++++----- 8 files changed, 61 insertions(+), 177 deletions(-) delete mode 100644 .riot/requirements/1b559c7.txt delete mode 100644 .riot/requirements/1c31b2e.txt delete mode 100644 .riot/requirements/1e81c99.txt delete mode 100644 .riot/requirements/1ef847d.txt delete mode 100644 .riot/requirements/1fa5bd6.txt delete mode 100644 .riot/requirements/d6b8465.txt diff --git a/.riot/requirements/1b559c7.txt b/.riot/requirements/1b559c7.txt deleted file mode 100644 index 26edc3cea32..00000000000 --- a/.riot/requirements/1b559c7.txt +++ /dev/null @@ -1,28 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.7 -# by the following command: -# -# pip-compile --config=pyproject.toml --no-annotate --resolver=backtracking .riot/requirements/1b559c7.in -# -attrs==23.2.0 -coverage[toml]==7.2.7 -exceptiongroup==1.2.1 -gunicorn==22.0.0 -hypothesis==6.45.0 -importlib-metadata==6.7.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.0 -pluggy==1.2.0 -py-cpuinfo==8.0.0 -pytest==7.4.4 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==4.1.0 -pytest-mock==3.11.1 -pytest-randomly==3.12.0 -sortedcontainers==2.4.0 -tomli==2.0.1 -typing-extensions==4.7.1 -zipp==3.15.0 diff --git a/.riot/requirements/1c31b2e.txt b/.riot/requirements/1c31b2e.txt deleted file mode 100644 index 2265ac9e1bf..00000000000 --- a/.riot/requirements/1c31b2e.txt +++ /dev/null @@ -1,25 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.10 -# by the following command: -# -# pip-compile --no-annotate .riot/requirements/1c31b2e.in -# -attrs==23.2.0 -coverage[toml]==7.5.4 -exceptiongroup==1.2.1 -gunicorn==22.0.0 -hypothesis==6.45.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.1 -pluggy==1.5.0 -py-cpuinfo==8.0.0 -pytest==8.2.2 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-randomly==3.15.0 -sortedcontainers==2.4.0 -tomli==2.0.1 diff --git a/.riot/requirements/1e81c99.txt b/.riot/requirements/1e81c99.txt deleted file mode 100644 index bf861c2c04e..00000000000 --- a/.riot/requirements/1e81c99.txt +++ /dev/null @@ -1,27 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.8 -# by the following command: -# -# pip-compile --no-annotate .riot/requirements/1e81c99.in -# -attrs==23.2.0 -coverage[toml]==7.5.4 -exceptiongroup==1.2.1 -gunicorn==22.0.0 -hypothesis==6.45.0 -importlib-metadata==8.0.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.1 -pluggy==1.5.0 -py-cpuinfo==8.0.0 -pytest==8.2.2 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-randomly==3.15.0 -sortedcontainers==2.4.0 -tomli==2.0.1 -zipp==3.19.2 diff --git a/.riot/requirements/1ef847d.txt b/.riot/requirements/1ef847d.txt deleted file mode 100644 index b19287549af..00000000000 --- a/.riot/requirements/1ef847d.txt +++ /dev/null @@ -1,27 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.9 -# by the following command: -# -# pip-compile --no-annotate .riot/requirements/1ef847d.in -# -attrs==23.2.0 -coverage[toml]==7.5.4 -exceptiongroup==1.2.1 -gunicorn==22.0.0 -hypothesis==6.45.0 -importlib-metadata==8.0.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.1 -pluggy==1.5.0 -py-cpuinfo==8.0.0 -pytest==8.2.2 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-randomly==3.15.0 -sortedcontainers==2.4.0 -tomli==2.0.1 -zipp==3.19.2 diff --git a/.riot/requirements/1fa5bd6.txt b/.riot/requirements/1fa5bd6.txt deleted file mode 100644 index eb812e9c5e4..00000000000 --- a/.riot/requirements/1fa5bd6.txt +++ /dev/null @@ -1,23 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.12 -# by the following command: -# -# pip-compile --no-annotate .riot/requirements/1fa5bd6.in -# -attrs==23.2.0 -coverage[toml]==7.5.4 -gunicorn==22.0.0 -hypothesis==6.45.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.1 -pluggy==1.5.0 -py-cpuinfo==8.0.0 -pytest==8.2.2 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-randomly==3.15.0 -sortedcontainers==2.4.0 diff --git a/.riot/requirements/d6b8465.txt b/.riot/requirements/d6b8465.txt deleted file mode 100644 index 2ecfd37168e..00000000000 --- a/.riot/requirements/d6b8465.txt +++ /dev/null @@ -1,23 +0,0 @@ -# -# This file is autogenerated by pip-compile with Python 3.11 -# by the following command: -# -# pip-compile --no-annotate .riot/requirements/d6b8465.in -# -attrs==23.2.0 -coverage[toml]==7.5.4 -gunicorn==22.0.0 -hypothesis==6.45.0 -iniconfig==2.0.0 -mock==5.1.0 -opentracing==2.4.0 -packaging==24.1 -pluggy==1.5.0 -py-cpuinfo==8.0.0 -pytest==8.2.2 -pytest-asyncio==0.21.1 -pytest-benchmark==4.0.0 -pytest-cov==5.0.0 -pytest-mock==3.14.0 -pytest-randomly==3.15.0 -sortedcontainers==2.4.0 diff --git a/riotfile.py b/riotfile.py index 3c6e02a1db3..d31956f961b 100644 --- a/riotfile.py +++ b/riotfile.py @@ -2799,13 +2799,6 @@ def select_pys(min_version=MIN_PYTHON_VERSION, max_version=MAX_PYTHON_VERSION): "pytest-randomly": latest, }, venvs=[ - Venv( - name="profile-wrapt-disabled", - pys=select_pys(), - env={ - "WRAPT_DISABLE_EXTENSIONS": "1", - }, - ), # Python 3.7 Venv( pys="3.7", diff --git a/tests/profiling/collector/test_threading.py b/tests/profiling/collector/test_threading.py index 5c55382f473..e143727a072 100644 --- a/tests/profiling/collector/test_threading.py +++ b/tests/profiling/collector/test_threading.py @@ -8,7 +8,6 @@ import pytest from ddtrace.profiling import recorder -from ddtrace.profiling.collector import _lock from ddtrace.profiling.collector import threading as collector_threading from . import test_collector @@ -592,33 +591,78 @@ def test_anonymous_lock(): assert release_event.frames[0] == (__file__.replace(".pyc", ".py"), release_lineno, "test_anonymous_lock", "") -def test_wrapt_c_ext_config(): - if os.environ.get("WRAPT_DISABLE_EXTENSIONS"): - assert _lock.WRAPT_C_EXT is False - else: - try: - import wrapt._wrappers as _w - except ImportError: - assert _lock.WRAPT_C_EXT is False - else: - assert _lock.WRAPT_C_EXT is True - del _w +@pytest.mark.subprocess( + env=dict(WRAPT_DISABLE_EXTENSIONS="True", DD_PROFILING_FILE_PATH=__file__), +) +def test_wrapt_disable_extensions(): + import os + import sys + import threading + + from ddtrace.profiling import recorder + from ddtrace.profiling.collector import _lock + from ddtrace.profiling.collector import threading as collector_threading + from tests.profiling.collector.utils import get_lock_linenos + from tests.profiling.collector.utils import init_linenos + + init_linenos(os.environ["DD_PROFILING_FILE_PATH"]) + + # WRAPT_DISABLE_EXTENSIONS is a flag that can be set to disable the C extension + # for wrapt. It's not set by default in dd-trace-py, but it can be set by + # users. This test checks that the collector works even if the flag is set. + assert os.environ.get("WRAPT_DISABLE_EXTENSIONS") + assert _lock.WRAPT_C_EXT is False + r = recorder.Recorder() with collector_threading.ThreadingLockCollector(r, capture_pct=100): - th_lock = threading.Lock() # !CREATE! test_wrapt_c_ext_config - with th_lock: # !ACQUIRE! !RELEASE! test_wrapt_c_ext_config + th_lock = threading.Lock() # !CREATE! test_wrapt_disable_extensions + with th_lock: # !ACQUIRE! !RELEASE! test_wrapt_disable_extensions pass - linenos = get_lock_linenos("test_wrapt_c_ext_config") + linenos = get_lock_linenos("test_wrapt_disable_extensions") assert len(r.events[collector_threading.ThreadingLockAcquireEvent]) == 1 acquire_event = r.events[collector_threading.ThreadingLockAcquireEvent][0] assert acquire_event.lock_name == "test_threading.py:{}:th_lock".format(linenos.create) - assert acquire_event.frames[0] == (__file__.replace(".pyc", ".py"), linenos.acquire, "test_wrapt_c_ext_config", "") + + expected_filename = os.environ["DD_PROFILING_FILE_PATH"].replace(".pyc", ".py") + + assert len(acquire_event.frames) > 0, "No frames found" + acquire_frame = acquire_event.frames[0] + + # This test is run in a subprocess, and doesn't show details about why it + # failed, so we add more details to the assert message. + assert expected_filename.endswith(acquire_frame.file_name), "Expected filename {} to end with {}".format( + expected_filename, acquire_frame.file_name + ) + assert acquire_frame.lineno == linenos.acquire, "Expected line number {}, got {}".format( + linenos.acquire, acquire_frame.lineno + ) + # As this test runs in a subprocess, the body of this function is simply + # placed in a temporary file, and we don't have a function name and it + # defaults to + assert acquire_frame.function_name == "", "Expected function name , got {}".format( + acquire_frame.function_name + ) + assert acquire_frame.class_name == "", "Expected class name '', got {}".format(acquire_frame.class_name) + assert len(r.events[collector_threading.ThreadingLockReleaseEvent]) == 1 release_event = r.events[collector_threading.ThreadingLockReleaseEvent][0] assert release_event.lock_name == "test_threading.py:{}:th_lock".format(linenos.create) release_lineno = linenos.acquire + (0 if sys.version_info >= (3, 10) else 1) - assert release_event.frames[0] == (__file__.replace(".pyc", ".py"), release_lineno, "test_wrapt_c_ext_config", "") + + assert len(release_event.frames) > 0, "No frames found" + release_frame = release_event.frames[0] + + assert expected_filename.endswith(release_frame.file_name), "Expected filename {} to end with {}".format( + expected_filename, release_frame.file_name + ) + assert release_frame.lineno == release_lineno, "Expected line number {}, got {}".format( + release_lineno, release_frame.lineno + ) + assert release_frame.function_name == "", "Expected function name , got {}".format( + release_frame.function_name + ) + assert release_frame.class_name == "", "Expected class name '', got {}".format(release_frame.class_name) def test_global_locks(): From 90c20ee18e399f51950a114464387a76cf909a07 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 4 Sep 2024 13:20:22 -0400 Subject: [PATCH 03/12] ci: revert "ci: migrate elasticsearch suite to GitLab (#10468)" (#10508) This reverts commit 9fa8f407d5a48f0dc9ae044b3a782da2b55cb414. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .circleci/config.templ.yml | 9 +++++++++ .gitlab/tests/contrib.yml | 18 ------------------ 2 files changed, 9 insertions(+), 18 deletions(-) diff --git a/.circleci/config.templ.yml b/.circleci/config.templ.yml index bf3d40f18e1..0a43a7e4ec4 100644 --- a/.circleci/config.templ.yml +++ b/.circleci/config.templ.yml @@ -691,6 +691,15 @@ jobs: - run_test: pattern: 'consul' + elasticsearch: + <<: *machine_executor + parallelism: 17 + steps: + - run_test: + pattern: 'elasticsearch' + snapshot: true + docker_services: 'elasticsearch opensearch' + django: <<: *machine_executor parallelism: 4 diff --git a/.gitlab/tests/contrib.yml b/.gitlab/tests/contrib.yml index 352a86a0dc7..34c314a901f 100644 --- a/.gitlab/tests/contrib.yml +++ b/.gitlab/tests/contrib.yml @@ -23,24 +23,6 @@ dogpile_cache: variables: SUITE_NAME: "dogpile_cache" -elasticsearch: - extends: .test_base_riot_snapshot - parallel: 17 - services: - - !reference [.test_base_riot_snapshot, services] - - name: registry.ddbuild.io/images/mirror/library/elasticsearch:7.17.23 - alias: elasticsearch - variables: - discovery.type: single-node - xpack.security.enabled: false - - name: registry.ddbuild.io/images/mirror/opensearchproject/opensearch:1.3.13 - alias: opensearch - variables: - DISABLE_SECURITY_PLUGIN: true - discovery.type: single-node - variables: - SUITE_NAME: "elasticsearch" - falcon: extends: .test_base_riot_snapshot variables: From 3d4d71025547fb33155ec541183b1d07863fc002 Mon Sep 17 00:00:00 2001 From: Taegyun Kim Date: Wed, 4 Sep 2024 14:15:14 -0400 Subject: [PATCH 04/12] chore(profiling): use ddtrace config instead of reading os.environ (#10499) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/profiling/profiler.py | 51 ++++++++++++++++---------------- tests/profiling/test_profiler.py | 35 ++++++++++++++-------- 2 files changed, 48 insertions(+), 38 deletions(-) diff --git a/ddtrace/profiling/profiler.py b/ddtrace/profiling/profiler.py index eacf5ac50d6..1b33a7021c7 100644 --- a/ddtrace/profiling/profiler.py +++ b/ddtrace/profiling/profiler.py @@ -9,6 +9,7 @@ from typing import Union # noqa:F401 import ddtrace +from ddtrace import config from ddtrace.internal import agent from ddtrace.internal import atexit from ddtrace.internal import forksafe @@ -26,7 +27,7 @@ from ddtrace.profiling.collector import stack from ddtrace.profiling.collector import stack_event from ddtrace.profiling.collector import threading -from ddtrace.settings.profiling import config +from ddtrace.settings.profiling import config as profiling_config LOG = logging.getLogger(__name__) @@ -114,23 +115,23 @@ def __init__( version: Optional[str] = None, tracer: Any = ddtrace.tracer, api_key: Optional[str] = None, - agentless: bool = config.agentless, - _memory_collector_enabled: bool = config.memory.enabled, - _stack_collector_enabled: bool = config.stack.enabled, - _stack_v2_enabled: bool = config.stack.v2_enabled, - _lock_collector_enabled: bool = config.lock.enabled, - enable_code_provenance: bool = config.code_provenance, - endpoint_collection_enabled: bool = config.endpoint_collection, + agentless: bool = profiling_config.agentless, + _memory_collector_enabled: bool = profiling_config.memory.enabled, + _stack_collector_enabled: bool = profiling_config.stack.enabled, + _stack_v2_enabled: bool = profiling_config.stack.v2_enabled, + _lock_collector_enabled: bool = profiling_config.lock.enabled, + enable_code_provenance: bool = profiling_config.code_provenance, + endpoint_collection_enabled: bool = profiling_config.endpoint_collection, ): super().__init__() # User-supplied values self.url: Optional[str] = url - self.service: Optional[str] = service if service is not None else os.environ.get("DD_SERVICE") - self.tags: Dict[str, str] = tags if tags is not None else config.tags - self.env: Optional[str] = env if env is not None else os.environ.get("DD_ENV") - self.version: Optional[str] = version if version is not None else os.environ.get("DD_VERSION") + self.service: Optional[str] = service if service is not None else config.service + self.tags: Dict[str, str] = tags if tags is not None else profiling_config.tags + self.env: Optional[str] = env if env is not None else config.env + self.version: Optional[str] = version if version is not None else config.version self.tracer: Any = tracer - self.api_key: Optional[str] = api_key if api_key is not None else os.environ.get("DD_API_KEY") + self.api_key: Optional[str] = api_key if api_key is not None else config._dd_api_key self.agentless: bool = agentless self._memory_collector_enabled: bool = _memory_collector_enabled self._stack_collector_enabled: bool = _stack_collector_enabled @@ -145,7 +146,7 @@ def __init__( self._collectors_on_import: Any = None self._scheduler: Optional[Union[scheduler.Scheduler, scheduler.ServerlessScheduler]] = None self._lambda_function_name: Optional[str] = os.environ.get("AWS_LAMBDA_FUNCTION_NAME") - self._export_libdd_enabled: bool = config.export.libdd_enabled + self._export_libdd_enabled: bool = profiling_config.export.libdd_enabled self.__post_init__() @@ -161,7 +162,7 @@ def _build_default_exporters(self): # type: (...) -> List[exporter.Exporter] if not self._export_libdd_enabled: # If libdatadog support is enabled, we can skip this part - _OUTPUT_PPROF = config.output_pprof + _OUTPUT_PPROF = profiling_config.output_pprof if _OUTPUT_PPROF: # DEV: Import this only if needed to avoid importing protobuf # unnecessarily @@ -207,15 +208,15 @@ def _build_default_exporters(self): configured_features.append("lock") if self._memory_collector_enabled: configured_features.append("mem") - if config.heap.sample_size > 0: + if profiling_config.heap.sample_size > 0: configured_features.append("heap") if self._export_libdd_enabled: configured_features.append("exp_dd") else: configured_features.append("exp_py") - configured_features.append("CAP" + str(config.capture_pct)) - configured_features.append("MAXF" + str(config.max_frames)) + configured_features.append("CAP" + str(profiling_config.capture_pct)) + configured_features.append("MAXF" + str(profiling_config.max_frames)) self.tags.update({"profiler_config": "_".join(configured_features)}) endpoint_call_counter_span_processor = self.tracer._endpoint_call_counter_span_processor @@ -231,11 +232,11 @@ def _build_default_exporters(self): service=self.service, version=self.version, tags=self.tags, # type: ignore - max_nframes=config.max_frames, + max_nframes=profiling_config.max_frames, url=endpoint, - timeline_enabled=config.timeline_enabled, - output_filename=config.output_pprof, - sample_pool_capacity=config.sample_pool_capacity, + timeline_enabled=profiling_config.timeline_enabled, + output_filename=profiling_config.output_pprof, + sample_pool_capacity=profiling_config.sample_pool_capacity, ) ddup.start() @@ -243,13 +244,13 @@ def _build_default_exporters(self): except Exception as e: LOG.error("Failed to initialize libdd collector (%s), falling back to the legacy collector", e) self._export_libdd_enabled = False - config.export.libdd_enabled = False + profiling_config.export.libdd_enabled = False # also disable other features that might be enabled if self._stack_v2_enabled: LOG.error("Disabling stack_v2 as libdd collector failed to initialize") self._stack_v2_enabled = False - config.stack.v2_enabled = False + profiling_config.stack.v2_enabled = False # DEV: Import this only if needed to avoid importing protobuf # unnecessarily @@ -284,7 +285,7 @@ def __post_init__(self): # Do not limit the heap sample size as the number of events is relative to allocated memory anyway memalloc.MemoryHeapSampleEvent: None, }, - default_max_events=config.max_events, + default_max_events=profiling_config.max_events, ) if self._stack_collector_enabled: diff --git a/tests/profiling/test_profiler.py b/tests/profiling/test_profiler.py index d3281fe2aca..7e6d0ba1cee 100644 --- a/tests/profiling/test_profiler.py +++ b/tests/profiling/test_profiler.py @@ -43,13 +43,15 @@ def test_multiple_stop(): p.stop(flush=False) -@pytest.mark.parametrize( - "service_name_var", - ("DD_SERVICE",), +@pytest.mark.subprocess( + env=dict(DD_API_KEY="foobar", DD_SERVICE="foobar"), ) -def test_default_from_env(service_name_var, monkeypatch): - monkeypatch.setenv("DD_API_KEY", "foobar") - monkeypatch.setenv(service_name_var, "foobar") +def test_default_from_env(): + import pytest + + from ddtrace.profiling import profiler + from ddtrace.profiling.exporter import http + prof = profiler.Profiler() for exp in prof._profiler._scheduler.exporters: if isinstance(exp, http.PprofHTTPExporter): @@ -110,10 +112,15 @@ def test_profiler_init_float_division_regression(run_python_code_in_subprocess): assert err == b"" -def test_env_default(monkeypatch): - monkeypatch.setenv("DD_API_KEY", "foobar") - monkeypatch.setenv("DD_ENV", "staging") - monkeypatch.setenv("DD_VERSION", "123") +@pytest.mark.subprocess( + env=dict(DD_API_KEY="foobar", DD_ENV="staging", DD_VERSION="123"), +) +def test_env_default(): + import pytest + + from ddtrace.profiling import profiler + from ddtrace.profiling.exporter import http + prof = profiler.Profiler() assert prof.env == "staging" assert prof.version == "123" @@ -295,9 +302,11 @@ def test_env_endpoint_url(): _check_url(prof, "http://foobar:123", os.environ.get("DD_API_KEY")) -def test_env_endpoint_url_no_agent(monkeypatch): - monkeypatch.setenv("DD_SITE", "datadoghq.eu") - monkeypatch.setenv("DD_API_KEY", "123") +@pytest.mark.subprocess(env=dict(DD_SITE="datadoghq.eu", DD_API_KEY="123")) +def test_env_endpoint_url_no_agent(): + from ddtrace.profiling import profiler + from tests.profiling.test_profiler import _check_url + prof = profiler.Profiler() _check_url(prof, "http://localhost:8126", "123") From bcb939ed4abb5e844441cca30b0866be9d84aada Mon Sep 17 00:00:00 2001 From: Alberto Vara Date: Wed, 4 Sep 2024 20:29:32 +0200 Subject: [PATCH 05/12] chore(iast): clean the code (#10486) Standardize the import paths. Import the functions in alphabetical order. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/appsec/_handlers.py | 2 +- ddtrace/appsec/_iast/_metrics.py | 4 +- .../appsec/_iast/_taint_tracking/__init__.py | 64 ++++++++++--------- .../appsec/iast/aspects/test_split_aspect.py | 8 +-- 4 files changed, 40 insertions(+), 38 deletions(-) diff --git a/ddtrace/appsec/_handlers.py b/ddtrace/appsec/_handlers.py index 265706fdefc..0dfe9f01325 100644 --- a/ddtrace/appsec/_handlers.py +++ b/ddtrace/appsec/_handlers.py @@ -375,8 +375,8 @@ def _on_django_patch(): def _custom_protobuf_getattribute(self, name): + from ddtrace.appsec._iast._taint_tracking import OriginType from ddtrace.appsec._iast._taint_tracking import taint_pyobject - from ddtrace.appsec._iast._taint_tracking._native.taint_tracking import OriginType from ddtrace.appsec._iast._taint_utils import taint_structure ret = type(self).__saved_getattr(self, name) diff --git a/ddtrace/appsec/_iast/_metrics.py b/ddtrace/appsec/_iast/_metrics.py index 03a24466ec2..4cd5e6a0d8f 100644 --- a/ddtrace/appsec/_iast/_metrics.py +++ b/ddtrace/appsec/_iast/_metrics.py @@ -81,7 +81,7 @@ def _set_iast_error_metric(msg: Text) -> None: @metric_verbosity(TELEMETRY_MANDATORY_VERBOSITY) def _set_metric_iast_instrumented_source(source_type): - from ._taint_tracking._native.taint_tracking import origin_to_str # noqa: F401 + from ._taint_tracking import origin_to_str telemetry.telemetry_writer.add_count_metric( TELEMETRY_NAMESPACE_TAG_IAST, "instrumented.source", 1, (("source_type", origin_to_str(source_type)),) @@ -102,7 +102,7 @@ def _set_metric_iast_instrumented_sink(vulnerability_type, counter=1): @metric_verbosity(TELEMETRY_INFORMATION_VERBOSITY) def _set_metric_iast_executed_source(source_type): - from ._taint_tracking._native.taint_tracking import origin_to_str # noqa: F401 + from ._taint_tracking import origin_to_str telemetry.telemetry_writer.add_count_metric( TELEMETRY_NAMESPACE_TAG_IAST, "executed.source", 1, (("source_type", origin_to_str(source_type)),) diff --git a/ddtrace/appsec/_iast/_taint_tracking/__init__.py b/ddtrace/appsec/_iast/_taint_tracking/__init__.py index a5ab8946bd0..8f4c46edf8f 100644 --- a/ddtrace/appsec/_iast/_taint_tracking/__init__.py +++ b/ddtrace/appsec/_iast/_taint_tracking/__init__.py @@ -63,50 +63,52 @@ __all__ = [ - "_convert_escaped_text_to_tainted_text", - "new_pyobject_id", - "setup", - "Source", "OriginType", + "Source", "TagMappingMode", "TaintRange", - "get_ranges", - "set_ranges", - "copy_ranges_from_strings", - "copy_and_shift_ranges_from_strings", - "are_all_text_all_ranges", - "shift_taint_range", - "shift_taint_ranges", - "get_range_by_hash", - "is_notinterned_notfasttainted_unicode", - "set_fast_tainted_if_notinterned_unicode", - "aspect_helpers", - "reset_context", - "initializer_size", - "active_map_addreses_size", - "create_context", - "str_to_origin", - "origin_to_str", - "common_replace", - "_aspect_ospathjoin", - "_aspect_split", - "_aspect_rsplit", - "_aspect_splitlines", + "_aspect_modulo", "_aspect_ospathbasename", "_aspect_ospathdirname", + "_aspect_ospathjoin", "_aspect_ospathnormcase", "_aspect_ospathsplit", - "_aspect_ospathsplitext", "_aspect_ospathsplitdrive", + "_aspect_ospathsplitext", "_aspect_ospathsplitroot", + "_aspect_rsplit", + "_aspect_split", + "_aspect_splitlines", + "_convert_escaped_text_to_tainted_text", "_format_aspect", + "active_map_addreses_size", + "are_all_text_all_ranges", "as_formatted_evidence", - "parse_params", - "set_ranges_on_splitted", - "num_objects_tainted", + "aspect_helpers", + "common_replace", + "copy_and_shift_ranges_from_strings", + "copy_ranges_from_strings", + "create_context", "debug_taint_map", + "get_range_by_hash", + "get_ranges", "iast_taint_log_error", - "_aspect_modulo", + "initializer_size", + "is_notinterned_notfasttainted_unicode", + "is_pyobject_tainted", + "new_pyobject_id", + "num_objects_tainted", + "origin_to_str", + "parse_params", + "reset_context", + "set_fast_tainted_if_notinterned_unicode", + "set_ranges", + "set_ranges_on_splitted", + "setup", + "shift_taint_range", + "shift_taint_ranges", + "str_to_origin", + "taint_pyobject", ] diff --git a/tests/appsec/iast/aspects/test_split_aspect.py b/tests/appsec/iast/aspects/test_split_aspect.py index 4506aec87f6..75229a09da7 100644 --- a/tests/appsec/iast/aspects/test_split_aspect.py +++ b/tests/appsec/iast/aspects/test_split_aspect.py @@ -4,17 +4,17 @@ import pytest from ddtrace.appsec._constants import IAST +from ddtrace.appsec._iast._taint_tracking import OriginType +from ddtrace.appsec._iast._taint_tracking import Source from ddtrace.appsec._iast._taint_tracking import TaintRange from ddtrace.appsec._iast._taint_tracking import _aspect_rsplit from ddtrace.appsec._iast._taint_tracking import _aspect_split from ddtrace.appsec._iast._taint_tracking import _aspect_splitlines from ddtrace.appsec._iast._taint_tracking import create_context +from ddtrace.appsec._iast._taint_tracking import get_ranges from ddtrace.appsec._iast._taint_tracking import reset_context +from ddtrace.appsec._iast._taint_tracking import set_ranges from ddtrace.appsec._iast._taint_tracking import taint_pyobject -from ddtrace.appsec._iast._taint_tracking._native.taint_tracking import OriginType -from ddtrace.appsec._iast._taint_tracking._native.taint_tracking import Source -from ddtrace.appsec._iast._taint_tracking._native.taint_tracking import get_ranges -from ddtrace.appsec._iast._taint_tracking._native.taint_tracking import set_ranges from tests.appsec.iast.aspects.test_aspect_helpers import _build_sample_range from tests.utils import override_env From ecde8bb3778233cf46d4f51ef048a3b822a334c3 Mon Sep 17 00:00:00 2001 From: Yun Kim <35776586+Yun-Kim@users.noreply.github.com> Date: Wed, 4 Sep 2024 14:47:39 -0400 Subject: [PATCH 06/12] fix(llmobs): keep custom trace filters in child processes (#10493) Fixes #10478. This PR ensures that any custom trace filters are not overwritten in forked processes. Previously on forked processes, the `LLMObs` service restarting on fork was overwriting all tracer filters with the llmobs internal filter. ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/llmobs/_llmobs.py | 5 ++- ...-fork-custom-filters-5ba642f9395a0ddd.yaml | 4 ++ tests/llmobs/test_llmobs_service.py | 37 +++++++++++++++++++ 3 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/fix-llmobs-fork-custom-filters-5ba642f9395a0ddd.yaml diff --git a/ddtrace/llmobs/_llmobs.py b/ddtrace/llmobs/_llmobs.py index cb119490c67..0ff1db4c528 100644 --- a/ddtrace/llmobs/_llmobs.py +++ b/ddtrace/llmobs/_llmobs.py @@ -90,7 +90,10 @@ def __init__(self, tracer=None): def _child_after_fork(self): self._llmobs_span_writer = self._llmobs_span_writer.recreate() self._trace_processor._span_writer = self._llmobs_span_writer - self.tracer.configure(settings={"FILTERS": [self._trace_processor]}) + tracer_filters = self.tracer._filters + if not any(isinstance(tracer_filter, LLMObsTraceProcessor) for tracer_filter in tracer_filters): + tracer_filters += [self._trace_processor] + self.tracer.configure(settings={"FILTERS": tracer_filters}) try: self._llmobs_span_writer.start() except ServiceStatusError: diff --git a/releasenotes/notes/fix-llmobs-fork-custom-filters-5ba642f9395a0ddd.yaml b/releasenotes/notes/fix-llmobs-fork-custom-filters-5ba642f9395a0ddd.yaml new file mode 100644 index 00000000000..62d86c61db3 --- /dev/null +++ b/releasenotes/notes/fix-llmobs-fork-custom-filters-5ba642f9395a0ddd.yaml @@ -0,0 +1,4 @@ +--- +fixes: + - | + LLM Observability: This fix resolves an issue where custom trace filters were being overwritten in forked processes. diff --git a/tests/llmobs/test_llmobs_service.py b/tests/llmobs/test_llmobs_service.py index 0f989edb537..6f6acfbe3bd 100644 --- a/tests/llmobs/test_llmobs_service.py +++ b/tests/llmobs/test_llmobs_service.py @@ -8,6 +8,7 @@ from ddtrace._trace.context import Context from ddtrace._trace.span import Span from ddtrace.ext import SpanTypes +from ddtrace.filters import TraceFilter from ddtrace.internal.service import ServiceStatus from ddtrace.llmobs import LLMObs as llmobs_service from ddtrace.llmobs._constants import INPUT_DOCUMENTS @@ -1471,3 +1472,39 @@ def test_llmobs_fork_create_span(monkeypatch): exit_code = os.WEXITSTATUS(status) assert exit_code == 12 llmobs_service.disable() + + +def test_llmobs_fork_custom_filter(monkeypatch): + """Test that forking a process correctly keeps any custom filters.""" + + class CustomFilter(TraceFilter): + def process_trace(self, trace): + return trace + + monkeypatch.setenv("_DD_LLMOBS_WRITER_INTERVAL", 5.0) + with mock.patch("ddtrace.internal.writer.HTTPWriter._send_payload"): + tracer = DummyTracer() + custom_filter = CustomFilter() + tracer.configure(settings={"FILTERS": [custom_filter]}) + llmobs_service.enable(_tracer=tracer, ml_app="test_app") + assert custom_filter in llmobs_service._instance.tracer._filters + pid = os.fork() + if pid: # parent + assert custom_filter in llmobs_service._instance.tracer._filters + assert any( + isinstance(tracer_filter, LLMObsTraceProcessor) + for tracer_filter in llmobs_service._instance.tracer._filters + ) + else: # child + assert custom_filter in llmobs_service._instance.tracer._filters + assert any( + isinstance(tracer_filter, LLMObsTraceProcessor) + for tracer_filter in llmobs_service._instance.tracer._filters + ) + llmobs_service.disable() + os._exit(12) + + _, status = os.waitpid(pid, 0) + exit_code = os.WEXITSTATUS(status) + assert exit_code == 12 + llmobs_service.disable() From 8f4828da322ed233557f00d7bd8686b353f42de5 Mon Sep 17 00:00:00 2001 From: erikayasuda <153395705+erikayasuda@users.noreply.github.com> Date: Wed, 4 Sep 2024 15:21:13 -0500 Subject: [PATCH 07/12] chore: update changelog for version 2.11.2 (#10424) - [x] update changelog for version 2.11.2 --- CHANGELOG.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index f4897e1ded5..5b63d77bb5e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,21 @@ Changelogs for versions not listed here can be found at https://github.com/DataDog/dd-trace-py/releases +--- + +## 2.11.2 + + +### New Features + +- openai: Introduces `model` tag for openai integration metrics for consistency with the OpenAI SaaS Integration. It has the same value as `openai.request.model`. + +### Bug Fixes + +- LLM Observability: Resolves an issue where LLM Observability spans were not being submitted in forked processes, such as when using `celery` or `gunicorn` workers. The LLM Observability writer thread now automatically restarts when a forked process is detected. +- openai: Fixes a bug where `asyncio.TimeoutError`s were not being propagated correctly from canceled OpenAI API requests. + + --- ## 2.11.1 From 408a26dfa4c7645d58902abb39a648d5e0d96cc7 Mon Sep 17 00:00:00 2001 From: Quinna Halim Date: Wed, 4 Sep 2024 17:37:02 -0400 Subject: [PATCH 08/12] ci: fix permissions for generate package versions action (#10489) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- .github/workflows/generate-package-versions.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/generate-package-versions.yml b/.github/workflows/generate-package-versions.yml index b6ca257a43f..e4114528bb1 100644 --- a/.github/workflows/generate-package-versions.yml +++ b/.github/workflows/generate-package-versions.yml @@ -9,6 +9,8 @@ jobs: generate-package-versions: name: Generate package versions runs-on: ubuntu-latest + permissions: + pull-requests: write steps: - uses: actions/checkout@v4 From f59d8678b69b642769ad853b17677c63ddb50dfe Mon Sep 17 00:00:00 2001 From: Alberto Vara Date: Thu, 5 Sep 2024 10:00:08 +0200 Subject: [PATCH 09/12] chore(iast): fastapi path source (#10484) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/appsec/_iast/_patch.py | 25 ++++++++++++- .../fastapi/test_fastapi_appsec_iast.py | 37 +++++++++++++++++++ 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/ddtrace/appsec/_iast/_patch.py b/ddtrace/appsec/_iast/_patch.py index e0812cccfe2..94155f365a9 100644 --- a/ddtrace/appsec/_iast/_patch.py +++ b/ddtrace/appsec/_iast/_patch.py @@ -101,14 +101,14 @@ def _patched_fastapi_function(origin, original_func, instance, args, kwargs): if _is_iast_enabled(): try: from ._taint_tracking import is_pyobject_tainted - from ._taint_tracking import taint_pyobject from .processor import AppSecIastSpanProcessor if not AppSecIastSpanProcessor.is_span_analyzed(): return result if not is_pyobject_tainted(result): - from ._taint_tracking._native.taint_tracking import origin_to_str + from ._taint_tracking import origin_to_str + from ._taint_tracking import taint_pyobject return taint_pyobject( pyobject=result, source_name=origin_to_str(origin), source_value=result, source_origin=origin @@ -159,6 +159,8 @@ def _on_iast_fastapi_patch(): "Headers.get", functools.partial(if_iast_taint_returned_object_for, OriginType.HEADER), ) + try_wrap_function_wrapper("starlette.datastructures", "URL.__init__", _iast_instrument_starlette_url) + try_wrap_function_wrapper( "fastapi", "Header", @@ -170,6 +172,25 @@ def _on_iast_fastapi_patch(): _set_metric_iast_instrumented_source(OriginType.PATH_PARAMETER) +def _iast_instrument_starlette_url(wrapped, instance, args, kwargs): + from ddtrace.appsec._iast._taint_tracking import OriginType + from ddtrace.appsec._iast._taint_tracking import origin_to_str + from ddtrace.appsec._iast._taint_tracking import taint_pyobject + + def path(self) -> str: + return taint_pyobject( + self.components.path, + source_name=origin_to_str(OriginType.PATH), + source_value=self.components.path, + source_origin=OriginType.PATH, + ) + + instance.__class__.path = property(path) + wrapped(*args, **kwargs) + + _set_metric_iast_instrumented_source(OriginType.PATH) + + def _iast_instrument_starlette_scope(scope): from ddtrace.appsec._iast._taint_tracking import OriginType from ddtrace.appsec._iast._taint_tracking import taint_pyobject diff --git a/tests/contrib/fastapi/test_fastapi_appsec_iast.py b/tests/contrib/fastapi/test_fastapi_appsec_iast.py index 405c5c1b89a..70f0682a9d6 100644 --- a/tests/contrib/fastapi/test_fastapi_appsec_iast.py +++ b/tests/contrib/fastapi/test_fastapi_appsec_iast.py @@ -287,6 +287,43 @@ async def test_route(item_id): assert result["ranges_origin"] == "http.request.path.parameter" +def test_path_source(fastapi_application, client, tracer, test_spans): + @fastapi_application.get("/path_source/") + async def test_route(request: Request): + from ddtrace.appsec._iast._taint_tracking import get_tainted_ranges + from ddtrace.appsec._iast._taint_tracking import origin_to_str + + path = request.url.path + ranges_result = get_tainted_ranges(path) + + return JSONResponse( + { + "result": path, + "is_tainted": len(ranges_result), + "ranges_start": ranges_result[0].start, + "ranges_length": ranges_result[0].length, + "ranges_origin": origin_to_str(ranges_result[0].source.origin), + } + ) + + # test if asgi middleware is ok without any callback registered + core.reset_listeners(event_id="asgi.request.parse.body") + + with override_global_config(dict(_iast_enabled=True)), override_env(IAST_ENV): + # disable callback + _aux_appsec_prepare_tracer(tracer) + resp = client.get( + "/path_source/", + ) + assert resp.status_code == 200 + result = json.loads(get_response_body(resp)) + assert result["result"] == "/path_source/" + assert result["is_tainted"] == 1 + assert result["ranges_start"] == 0 + assert result["ranges_length"] == 13 + assert result["ranges_origin"] == "http.request.path" + + def test_fastapi_sqli_path_param(fastapi_application, client, tracer, test_spans): @fastapi_application.get("/index.html/{param_str}") async def test_route(param_str): From 68562ad54b6613442c5e74c1bf993826152d5fd7 Mon Sep 17 00:00:00 2001 From: Alberto Vara Date: Thu, 5 Sep 2024 10:51:22 +0200 Subject: [PATCH 10/12] chore(iast): urlib propagation (#10482) - Allow AST to taint urllib - Fix flask query_string in Flask < 2.0 - Refactor type hinting on `benchmarks/bm/iast_fixtures/str_methods.py` ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- benchmarks/bm/iast_fixtures/str_methods.py | 188 ++++++++------- ddtrace/appsec/_handlers.py | 17 +- ddtrace/appsec/_iast/_ast/ast_patching.py | 7 +- ddtrace/appsec/_iast/_loader.py | 1 - .../appsec/_python_info/stdlib/__init__.py | 4 +- .../_python_info/stdlib/module_names_py310.py | 1 - .../_python_info/stdlib/module_names_py311.py | 1 - .../_python_info/stdlib/module_names_py312.py | 217 ++++++++++++++++++ .../_python_info/stdlib/module_names_py36.py | 1 - .../_python_info/stdlib/module_names_py37.py | 1 - .../_python_info/stdlib/module_names_py38.py | 1 - .../_python_info/stdlib/module_names_py39.py | 1 - tests/appsec/iast/aspects/conftest.py | 10 +- .../iast/aspects/test_other_patching.py | 59 +++++ .../contrib/django/test_django_appsec_iast.py | 6 +- tests/contrib/flask/test_flask_appsec_iast.py | 6 +- 16 files changed, 401 insertions(+), 120 deletions(-) create mode 100644 ddtrace/appsec/_python_info/stdlib/module_names_py312.py diff --git a/benchmarks/bm/iast_fixtures/str_methods.py b/benchmarks/bm/iast_fixtures/str_methods.py index d6f1a799533..813303e9d41 100644 --- a/benchmarks/bm/iast_fixtures/str_methods.py +++ b/benchmarks/bm/iast_fixtures/str_methods.py @@ -11,20 +11,15 @@ import random import re import threading -from typing import TYPE_CHECKING # noqa:F401 -from typing import Any # noqa:F401 -from typing import Optional # noqa:F401 -from typing import Text # noqa:F401 - - -if TYPE_CHECKING: # pragma: no cover - from typing import Callable # noqa:F401 - from typing import Dict # noqa:F401 - from typing import Generator # noqa:F401 - from typing import Iterable # noqa:F401 - from typing import List # noqa:F401 - from typing import Sequence # noqa:F401 - from typing import Tuple # noqa:F401 +from typing import Callable +from typing import Generator +from typing import Iterable +from typing import List +from typing import Optional +from typing import Sequence +from typing import Text +from typing import Tuple +import urllib.parse def methodcaller(*args, **kwargs): @@ -86,8 +81,8 @@ def do_tuple_string_assignment(a): return b, c, d, z -def uppercase_decorator(function): # type: (Callable) -> Callable - def wrapper(a, b): # type: (str, str) -> str +def uppercase_decorator(function: Callable) -> Callable: + def wrapper(a: str, b: str) -> Text: func = function(a, b) return func.upper() @@ -95,7 +90,7 @@ def wrapper(a, b): # type: (str, str) -> str @uppercase_decorator -def do_add_and_uppercase(a, b): # type: (str, str) -> str +def do_add_and_uppercase(a: Text, b: Text) -> Text: return a + b @@ -186,8 +181,7 @@ def do_decode(s, encoding="utf-8", errors="strict"): # type: (bytes, str, str) return s.decode(encoding, errors) -def do_translate(s, translate_dict): - # type: (str, dict) -> str +def do_translate(s: str, translate_dict: dict) -> str: return s.translate(translate_dict) @@ -195,31 +189,29 @@ def do_decode_simple(s: bytes) -> str: return s.decode() -def do_encode(s, encoding="utf-8", errors="strict"): - # type: (str, str, str) -> bytes +def do_encode(s: str, encoding: str = "utf-8", errors: str = "strict") -> bytes: return s.encode(encoding, errors) -def do_encode_from_dict(s, encoding="utf-8", errors="strict"): - # type: (str, str, str) -> bytes +def do_encode_from_dict(s: str, encoding: str = "utf-8", errors: str = "strict") -> bytes: my_dictionary = {} my_dictionary["test"] = s return my_dictionary.get("test", "").encode(encoding, errors) -def do_str_to_bytes(s): # type: (str) -> bytes +def do_str_to_bytes(s: str) -> bytes: return bytes(s, encoding="utf-8") -def do_str_to_bytearray(s): # type: (str) -> bytearray +def do_str_to_bytearray(s: str) -> bytearray: return bytearray(s, encoding="utf-8") -def do_str_to_bytes_to_bytearray(s): # type: (str) -> bytearray +def do_str_to_bytes_to_bytearray(s: str) -> bytearray: return bytearray(bytes(s, encoding="utf-8")) -def do_str_to_bytes_to_bytearray_to_str(s): # type: (str) -> str +def do_str_to_bytes_to_bytearray_to_str(s: str) -> str: return str(bytearray(bytes(s, encoding="utf-8")), encoding="utf-8") @@ -237,15 +229,15 @@ def do_bytearray_extend(ba: bytearray, b: bytearray) -> bytearray: return ba -def do_repr(b: Any) -> str: +def do_repr(b) -> Text: return repr(b) -def do_str(b: Any) -> str: +def do_str(b) -> Text: return str(b) -def do_bytes(b: Any) -> bytes: +def do_bytes(b) -> bytes: return bytes(b) @@ -703,15 +695,15 @@ class SampleClass(object): TIME_ZONE = "UTC/UTM" @staticmethod - def commonprefix(first, *args): # type: (Text, List[Any]) -> Sequence + def commonprefix(first: Text, *args: List) -> Sequence: return os.path.commonprefix(list([first]) + list(args)) -def do_join_tuple_unpack_with_call_no_replace(): # type: () -> Sequence +def do_join_tuple_unpack_with_call_no_replace() -> Sequence: return SampleClass.commonprefix("/usr/bin", *("/usr/local/lib", "/usr/lib")) -def do_join_tuple_unpack_with_call_with_methods(zoneinfo_root): # type: (str) -> bool +def do_join_tuple_unpack_with_call_with_methods(zoneinfo_root: str) -> bool: simple = SampleClass() return os.path.exists(os.path.join(zoneinfo_root, *(simple.TIME_ZONE.split("/")))) @@ -792,18 +784,18 @@ def function_next(self): # type: () -> str return output -def func_iter_sum(a): # type: (str) -> List - out = [] # type: List[str] +def func_iter_sum(a: str) -> List[str]: + out: List[str] = [] # type out += a, a return out -def get_random_string_module_encode(allowed_chars): # type: (str) -> List[str] +def get_random_string_module_encode(allowed_chars: str) -> List[str]: result = ("%s%s%s" % ("a", "b", "c")).encode("utf-8") return [allowed_chars for i in result] -def get_random_string_join(mystring): # type: (str) -> str +def get_random_string_join(mystring: str) -> Text: return "".join(mystring for i in ["1", "2"]) @@ -821,19 +813,19 @@ def get_random_string_seed( return "".join(random.choice(allowed_chars) for i in range(length)) -def mark_safe(a): # type: (str) -> str +def mark_safe(a: str) -> Text: return a -def conditional_escape(a): # type: (str) -> str +def conditional_escape(a: str) -> Text: return a -def format_html(a, args): # type: (str, *Tuple) -> str +def format_html(a: str, args: Tuple) -> str: return a.join(args) -def format_html_join(attrs, args_generator=None): # type: (str, List[str]) -> str +def format_html_join(attrs: str, args_generator: List[str] = None) -> str: if args_generator is None: args_generator = ["a", "b", "c"] @@ -841,7 +833,7 @@ def format_html_join(attrs, args_generator=None): # type: (str, List[str]) -> s return result -def get_wrapped_repeat_text_with_join(wrapper): # type: (Callable) -> Callable +def get_wrapped_repeat_text_with_join(wrapper: Callable) -> Callable: @wrapper def repeat_text_with_join(text, times=2): # type: (str, int) -> str # Use the join to confirm that we use a string-propagation method @@ -850,100 +842,95 @@ def repeat_text_with_join(text, times=2): # type: (str, int) -> str return repeat_text_with_join -def do_format_with_positional_parameter( - template, # type: Text - parameter, # type: Any -): # type: (...) -> Text +def do_format_with_positional_parameter(template: str, parameter: str) -> str: return template.format(parameter) def do_format_with_named_parameter( - template, # type: Text - value, # type: Any -): # type: (...) -> Text + template, + value, +): return template.format(key=value) -def mapper(taint_range): # type: (Any) -> Text +def mapper(taint_range) -> Text: return taint_range.origin.parameter_name -def do_args_kwargs_1(format_string, *args_safe, **kwargs_safe): # type: (str, Any, Any) -> str +def do_args_kwargs_1(format_string, *args_safe, **kwargs_safe) -> Text: return format_string.format(*args_safe, **kwargs_safe) -def do_args_kwargs_2(format_string, *args_safe, **kwargs_safe): # type: (str, Any, Any) -> str +def do_args_kwargs_2(format_string, *args_safe, **kwargs_safe) -> Text: return format_string.format("1", *args_safe, **kwargs_safe) -def do_args_kwargs_3(format_string, *args_safe, **kwargs_safe): # type: (str, Any, Any) -> str +def do_args_kwargs_3(format_string, *args_safe, **kwargs_safe) -> Text: return format_string.format("1", "2", *args_safe, **kwargs_safe) -def do_args_kwargs_4(format_string, *args_safe, **kwargs_safe): # type: (str, Any, Any) -> str +def do_args_kwargs_4(format_string, *args_safe, **kwargs_safe) -> Text: return format_string.format("1", "2", test_kwarg=3, *args_safe, **kwargs_safe) -def do_format_key_error(param1): # type: (str, Dict[str, Any]) -> str +def do_format_key_error(param1: str) -> Text: return "Test {param1}, {param2}".format(param1=param1) # noqa:F524 -def do_join(s, iterable): - # type: (str, Iterable) -> str +def do_join(s, iterable: Iterable) -> Text: return s.join(iterable) -def do_join_args_kwargs(s, *args, **kwargs): - # type: (str, Iterable) -> str +def do_join_args_kwargs(s, *args, **kwargs) -> Text: return s.join(*args, **kwargs) -def do_join_tuple(mystring): # type: (str) -> str +def do_join_tuple(mystring: str) -> Text: mystring = mystring gen = tuple(mystring + _ for _ in ["1", "2", "3"]) return "".join(gen) -def do_join_set(mystring): # type: (str) -> str +def do_join_set(mystring: str) -> Text: mystring = mystring gen = {mystring + _ for _ in ["1", "2", "3"]} return "".join(gen) -def do_join_generator(mystring): # type: (str) -> str +def do_join_generator(mystring: str) -> Text: mystring = mystring gen = (mystring for _ in ["1", "2", "3"]) return "".join(gen) -def do_join_generator_2(mystring): # type: (str) -> str - def parts(): # type: () -> Generator +def do_join_generator_2(mystring: str) -> Text: + def parts() -> Generator: for i in ["x", "y", "z"]: yield i return mystring.join(parts()) -def do_join_generator_and_title(mystring): # type: (str) -> str +def do_join_generator_and_title(mystring: str) -> Text: mystring = mystring.title() gen = (mystring for _ in ["1", "2", "3"]) return "".join(gen) -def do_modulo(template, parameter): # type: (Text, Any) -> Text +def do_modulo(template: Text, parameter) -> Text: return template % parameter -def do_replace(text, old, new, count=-1): # type: (Text, Text, Text, int) -> Text +def do_replace(text: Text, old: Text, new: Text, count=-1) -> Text: return text.replace(old, new, count) def do_slice( - text, # type: str - first, # type: Optional[int] - second, # type: Optional[int] - third, # type: Optional[int] -): # type: (...) -> str + text: str, + first: Optional[int], + second: Optional[int], + third: Optional[int], +) -> Text: # CAVEAT: the following code is duplicate on purpose (also present in production code), # because it needs to expose the slicing in order to be patched correctly. @@ -965,9 +952,7 @@ def do_slice( return key_lambda_map[cases_key](text) -def do_slice_complex( - s, # type: str -): +def do_slice_complex(s: str): import struct unpack = struct.unpack @@ -983,9 +968,7 @@ def do_slice_complex( return acc -def do_slice_negative( - s, # type: str -): +def do_slice_negative(s: str): return s[-16:] @@ -997,7 +980,7 @@ def __repr__(self): # type: () -> str return self.str_param + " a" -def do_format_fill(a): # type: (Any) -> str +def do_format_fill(a) -> Text: return "{:10}".format(a) @@ -1015,78 +998,78 @@ def do_slice_condition(s: str, first, second): return s[first : second or 0] -def do_namedtuple(s: Text) -> Any: +def do_namedtuple(s: Text): PathInfo = namedtuple("PathInfo", "name surname") my_string = PathInfo(name=s, surname=None) return my_string -def do_split_no_args(s): # type: (str) -> List[str] +def do_split_no_args(s: str) -> List[str]: return s.split() -def do_rsplit_no_args(s): # type: (str) -> List[str] +def do_rsplit_no_args(s: str) -> List[str]: return s.rsplit() -def do_split_maxsplit(s, maxsplit=-1): # type: (str, int) -> List[str] +def do_split_maxsplit(s: str, maxsplit: int = -1) -> List[str]: return s.split(maxsplit=maxsplit) -def do_rsplit_maxsplit(s, maxsplit=-1): # type: (str, int) -> List[str] +def do_rsplit_maxsplit(s: str, maxsplit: int = -1) -> List[str]: return s.rsplit(maxsplit=maxsplit) -def do_split_separator(s, separator): # type: (str, str) -> List[str] +def do_split_separator(s: str, separator: str) -> List[str]: return s.split(separator) -def do_rsplit_separator(s, separator): # type: (str, str) -> List[str] +def do_rsplit_separator(s: str, separator: str) -> List[str]: return s.rsplit(separator) -def do_split_separator_and_maxsplit(s, separator, maxsplit): # type: (str, str, int) -> List[str] +def do_split_separator_and_maxsplit(s: str, separator: str, maxsplit: int) -> List[str]: return s.split(separator, maxsplit) -def do_rsplit_separator_and_maxsplit(s, separator, maxsplit): # type: (str, str, int) -> List[str] +def do_rsplit_separator_and_maxsplit(s: str, separator: str, maxsplit: int) -> List[str]: return s.rsplit(separator, maxsplit) -def do_splitlines_no_arg(s): # type: (str) -> List[str] +def do_splitlines_no_arg(s: str) -> List[str]: return s.splitlines() -def do_splitlines_keepends(s, keepends): # type: (str, bool) -> List[str] +def do_splitlines_keepends(s, keepends): return s.splitlines(keepends=keepends) -def do_partition(s, sep): # type: (str, str) -> Tuple[str, str, str] +def do_partition(s, sep): return s.partition(sep) -def do_zfill(s, width): # type: (str, int) -> str +def do_zfill(s: str, width: int) -> str: return s.zfill(width) -def do_rsplit(s, sep, maxsplit=-1): # type: (str, str, int) -> List[str] +def do_rsplit(s, sep, maxsplit=-1): return s.rsplit(sep, maxsplit) -def do_rstrip_2(s): # type: (str) -> str +def do_rstrip_2(s) -> Text: return s.rstrip() -def do_index(c: str, i: int) -> str: +def do_index(c: str, i: int) -> Text: return c[i] -def do_methodcaller(s, func, *args): # type: (str, str, Any) -> str +def do_methodcaller(s, func, *args): func_method = operator.methodcaller(func, *args) return func_method(s) -def get_http_headers(header_key): # type: (str) -> bytes +def get_http_headers(header_key: str) -> bytes: RANDOM_PORT = 0 server = StoppableHTTPServer(("127.0.0.1", RANDOM_PORT), WebServerHandler) thread = threading.Thread(None, server.run) @@ -1242,3 +1225,14 @@ def _preprocess_lexer_input(text): text += "\n" return text + + +def index_lower_add(url): + i = 4 + scheme, url = url[:i].lower(), url[i + 1 :] + return scheme, url + + +def urlib_urlsplit(text): + results = urllib.parse.urlsplit(text) + return results diff --git a/ddtrace/appsec/_handlers.py b/ddtrace/appsec/_handlers.py index 0dfe9f01325..8b43e4ae855 100644 --- a/ddtrace/appsec/_handlers.py +++ b/ddtrace/appsec/_handlers.py @@ -194,22 +194,22 @@ def _on_request_init(wrapped, instance, args, kwargs): if _is_iast_enabled(): try: from ddtrace.appsec._iast._taint_tracking import OriginType + from ddtrace.appsec._iast._taint_tracking import origin_to_str from ddtrace.appsec._iast._taint_tracking import taint_pyobject from ddtrace.appsec._iast.processor import AppSecIastSpanProcessor if not AppSecIastSpanProcessor.is_span_analyzed(): return - # TODO: instance.query_string = ?? instance.query_string = taint_pyobject( pyobject=instance.query_string, - source_name=OriginType.QUERY, + source_name=origin_to_str(OriginType.QUERY), source_value=instance.query_string, source_origin=OriginType.QUERY, ) instance.path = taint_pyobject( pyobject=instance.path, - source_name=OriginType.PATH, + source_name=origin_to_str(OriginType.PATH), source_value=instance.path, source_origin=OriginType.PATH, ) @@ -246,7 +246,9 @@ def _on_flask_patch(flask_version): ) _set_metric_iast_instrumented_source(OriginType.HEADER) - try_wrap_function_wrapper("werkzeug.wrappers.request", "Request.__init__", _on_request_init) + if flask_version >= (2, 0, 0): + # instance.query_string: raising an error on werkzeug/_internal.py "AttributeError: read only property" + try_wrap_function_wrapper("werkzeug.wrappers.request", "Request.__init__", _on_request_init) _set_metric_iast_instrumented_source(OriginType.PATH) _set_metric_iast_instrumented_source(OriginType.QUERY) @@ -283,6 +285,7 @@ def _on_django_func_wrapped(fn_args, fn_kwargs, first_arg_expected_type, *_): if _is_iast_enabled() and fn_args and isinstance(fn_args[0], first_arg_expected_type): from ddtrace.appsec._iast._taint_tracking import OriginType # noqa: F401 from ddtrace.appsec._iast._taint_tracking import is_pyobject_tainted + from ddtrace.appsec._iast._taint_tracking import origin_to_str from ddtrace.appsec._iast._taint_tracking import taint_pyobject from ddtrace.appsec._iast._taint_utils import taint_structure from ddtrace.appsec._iast.processor import AppSecIastSpanProcessor @@ -298,7 +301,7 @@ def _on_django_func_wrapped(fn_args, fn_kwargs, first_arg_expected_type, *_): if not is_pyobject_tainted(getattr(http_req, "_body", None)): http_req._body = taint_pyobject( http_req.body, - source_name="body", + source_name=origin_to_str(OriginType.BODY), source_value=http_req.body, source_origin=OriginType.BODY, ) @@ -309,13 +312,13 @@ def _on_django_func_wrapped(fn_args, fn_kwargs, first_arg_expected_type, *_): ) http_req.path_info = taint_pyobject( http_req.path_info, - source_name="path", + source_name=origin_to_str(OriginType.PATH), source_value=http_req.path, source_origin=OriginType.PATH, ) http_req.environ["PATH_INFO"] = taint_pyobject( http_req.environ["PATH_INFO"], - source_name="path", + source_name=origin_to_str(OriginType.PATH), source_value=http_req.path, source_origin=OriginType.PATH, ) diff --git a/ddtrace/appsec/_iast/_ast/ast_patching.py b/ddtrace/appsec/_iast/_ast/ast_patching.py index e46334f2d7f..08a95ca5a68 100644 --- a/ddtrace/appsec/_iast/_ast/ast_patching.py +++ b/ddtrace/appsec/_iast/_ast/ast_patching.py @@ -129,10 +129,15 @@ def _should_iast_patch(module_name: Text) -> bool: # diff = max_allow - max_deny # return diff > 0 or (diff == 0 and not _in_python_stdlib_or_third_party(module_name)) if module_name.lower().startswith(IAST_ALLOWLIST): + log.debug("IAST: allowing %s. it's in the IAST_ALLOWLIST", module_name) return True if module_name.lower().startswith(IAST_DENYLIST): + log.debug("IAST: denying %s. it's in the IAST_DENYLIST", module_name) return False - return not _in_python_stdlib(module_name) + if _in_python_stdlib(module_name): + log.debug("IAST: denying %s. it's in the _in_python_stdlib", module_name) + return False + return True def visit_ast( diff --git a/ddtrace/appsec/_iast/_loader.py b/ddtrace/appsec/_iast/_loader.py index f211191430b..e81024932f6 100644 --- a/ddtrace/appsec/_iast/_loader.py +++ b/ddtrace/appsec/_iast/_loader.py @@ -8,7 +8,6 @@ log = get_logger(__name__) - IS_IAST_ENABLED = _is_iast_enabled() diff --git a/ddtrace/appsec/_python_info/stdlib/__init__.py b/ddtrace/appsec/_python_info/stdlib/__init__.py index 84d0af7cfa3..a040e57f859 100644 --- a/ddtrace/appsec/_python_info/stdlib/__init__.py +++ b/ddtrace/appsec/_python_info/stdlib/__init__.py @@ -13,8 +13,10 @@ from .module_names_py39 import STDLIB_MODULE_NAMES elif version_info < (3, 11, 0): from .module_names_py310 import STDLIB_MODULE_NAMES -else: +elif version_info < (3, 12, 0): from .module_names_py311 import STDLIB_MODULE_NAMES +else: + from .module_names_py312 import STDLIB_MODULE_NAMES def _stdlib_for_python_version(): # type: () -> set diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py310.py b/ddtrace/appsec/_python_info/stdlib/module_names_py310.py index 338f8073aa4..c99fec456f6 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py310.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py310.py @@ -196,7 +196,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py311.py b/ddtrace/appsec/_python_info/stdlib/module_names_py311.py index 47030a17d73..81937c5e5c2 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py311.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py311.py @@ -196,7 +196,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py312.py b/ddtrace/appsec/_python_info/stdlib/module_names_py312.py new file mode 100644 index 00000000000..81937c5e5c2 --- /dev/null +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py312.py @@ -0,0 +1,217 @@ +STDLIB_MODULE_NAMES = { + "__future__", + "_ast", + "_compression", + "_thread", + "abc", + "aifc", + "argparse", + "array", + "ast", + "asynchat", + "asyncio", + "asyncore", + "atexit", + "audioop", + "base64", + "bdb", + "binascii", + "bisect", + "builtins", + "bz2", + "cProfile", + "calendar", + "cgi", + "cgitb", + "chunk", + "cmath", + "cmd", + "code", + "codecs", + "codeop", + "collections", + "colorsys", + "compileall", + "concurrent", + "configparser", + "contextlib", + "contextvars", + "copy", + "copyreg", + "crypt", + "csv", + "ctypes", + "curses", + "dataclasses", + "datetime", + "dbm", + "decimal", + "difflib", + "dis", + "distutils", + "doctest", + "email", + "encodings", + "ensurepip", + "enum", + "errno", + "faulthandler", + "fcntl", + "filecmp", + "fileinput", + "fnmatch", + "fractions", + "ftplib", + "functools", + "gc", + "getopt", + "getpass", + "gettext", + "glob", + "graphlib", + "grp", + "gzip", + "hashlib", + "heapq", + "hmac", + "html", + "http", + "idlelib", + "imaplib", + "imghdr", + "imp", + "importlib", + "inspect", + "io", + "ipaddress", + "itertools", + "json", + "keyword", + "lib2to3", + "linecache", + "locale", + "logging", + "lzma", + "mailbox", + "mailcap", + "marshal", + "math", + "mimetypes", + "mmap", + "modulefinder", + "msilib", + "msvcrt", + "multiprocessing", + "netrc", + "nis", + "nntplib", + "ntpath", + "numbers", + "opcode", + "operator", + "optparse", + "os", + "ossaudiodev", + "pathlib", + "pdb", + "pickle", + "pickletools", + "pipes", + "pkgutil", + "platform", + "plistlib", + "poplib", + "posix", + "posixpath", + "pprint", + "profile", + "pstats", + "pty", + "pwd", + "py_compile", + "pyclbr", + "pydoc", + "queue", + "quopri", + "random", + "re", + "readline", + "reprlib", + "resource", + "rlcompleter", + "runpy", + "sched", + "secrets", + "select", + "selectors", + "shelve", + "shlex", + "shutil", + "signal", + "site", + "smtpd", + "smtplib", + "sndhdr", + "socket", + "socketserver", + "spwd", + "sqlite3", + "sre", + "sre_compile", + "sre_constants", + "sre_parse", + "ssl", + "stat", + "statistics", + "string", + "stringprep", + "struct", + "subprocess", + "sunau", + "symtable", + "sys", + "sysconfig", + "syslog", + "tabnanny", + "tarfile", + "telnetlib", + "tempfile", + "termios", + "test", + "textwrap", + "threading", + "time", + "timeit", + "tkinter", + "token", + "tokenize", + "tomllib", + "trace", + "traceback", + "tracemalloc", + "tty", + "turtle", + "turtledemo", + "types", + "typing", + "unicodedata", + "unittest", + "uu", + "uuid", + "venv", + "warnings", + "wave", + "weakref", + "webbrowser", + "winreg", + "winsound", + "wsgiref", + "xdrlib", + "xml", + "xmlrpc", + "zipapp", + "zipfile", + "zipimport", + "zlib", + "zoneinfo", +} diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py36.py b/ddtrace/appsec/_python_info/stdlib/module_names_py36.py index c4eb7c60e66..80a7b2e0d22 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py36.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py36.py @@ -199,7 +199,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py37.py b/ddtrace/appsec/_python_info/stdlib/module_names_py37.py index 0f989b2e988..1af04d31499 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py37.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py37.py @@ -200,7 +200,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py38.py b/ddtrace/appsec/_python_info/stdlib/module_names_py38.py index 1e5be0bd3ba..a5d5b7b81d4 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py38.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py38.py @@ -199,7 +199,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/ddtrace/appsec/_python_info/stdlib/module_names_py39.py b/ddtrace/appsec/_python_info/stdlib/module_names_py39.py index 6bdc9002bf1..0f862f079e9 100644 --- a/ddtrace/appsec/_python_info/stdlib/module_names_py39.py +++ b/ddtrace/appsec/_python_info/stdlib/module_names_py39.py @@ -198,7 +198,6 @@ "typing", "unicodedata", "unittest", - "urllib", "uu", "uuid", "venv", diff --git a/tests/appsec/iast/aspects/conftest.py b/tests/appsec/iast/aspects/conftest.py index 8edfe78128d..efbd78b63a0 100644 --- a/tests/appsec/iast/aspects/conftest.py +++ b/tests/appsec/iast/aspects/conftest.py @@ -4,9 +4,14 @@ import pytest from ddtrace.appsec._iast import oce +from ddtrace.appsec._iast._ast.ast_patching import _should_iast_patch from ddtrace.appsec._iast._ast.ast_patching import astpatch_module +class IastTestException(Exception): + pass + + def _iast_patched_module_and_patched_source(module_name, new_module_object=False): module = importlib.import_module(module_name) module_path, patched_source = astpatch_module(module) @@ -17,7 +22,10 @@ def _iast_patched_module_and_patched_source(module_name, new_module_object=False def _iast_patched_module(module_name, new_module_object=False): - module, _ = _iast_patched_module_and_patched_source(module_name, new_module_object) + if _should_iast_patch(module_name): + module, _ = _iast_patched_module_and_patched_source(module_name, new_module_object) + else: + raise IastTestException(f"IAST Test Error: module {module_name} was excluded") return module diff --git a/tests/appsec/iast/aspects/test_other_patching.py b/tests/appsec/iast/aspects/test_other_patching.py index 053beb8162e..d392fdb7c4b 100644 --- a/tests/appsec/iast/aspects/test_other_patching.py +++ b/tests/appsec/iast/aspects/test_other_patching.py @@ -1,3 +1,7 @@ +import sys + +import pytest + from ddtrace.appsec._iast._taint_tracking import OriginType from ddtrace.appsec._iast._taint_tracking import Source from ddtrace.appsec._iast._taint_tracking import TaintRange @@ -62,3 +66,58 @@ def test_preprocess_lexer_input(): assert get_tainted_ranges(result) == [ TaintRange(0, 22, Source("first_element", "print('Hello, world!')", OriginType.PARAMETER)) ] + + +def test_index_lower_add(): + text = "http://localhost:8000/api/articles?param1=value1" + string_input = taint_pyobject( + pyobject=text, source_name="first_element", source_value=text, source_origin=OriginType.PARAMETER + ) + result_scheme, result_url = mod.index_lower_add(string_input) + assert result_url == "//localhost:8000/api/articles?param1=value1" + assert result_scheme == "http" + + assert get_tainted_ranges(result_scheme) == [TaintRange(0, 4, Source("first_element", text, OriginType.PARAMETER))] + assert get_tainted_ranges(result_url) == [ + TaintRange(0, 43, Source("first_element", "print('Hello, world!')", OriginType.PARAMETER)) + ] + + +@pytest.mark.skipif( + sys.version_info >= (3, 11), + reason="Python 3.11 and 3.12 raise TypeError: don't know how to" "disassemble _lru_cache_wrapper objects", +) +def test_urlib_parse_patching(): + _iast_patched_module("urllib.parse") + + import dis + import urllib.parse + + bytecode = dis.Bytecode(urllib.parse.urlsplit) + assert "add_aspect" in bytecode.codeobj.co_names + if sys.version_info > (3, 9): + assert "replace_aspect" in bytecode.codeobj.co_names + assert "slice_aspect" in bytecode.codeobj.co_names + assert "lower_aspect" in bytecode.codeobj.co_names + + +def test_urlib_parse_propagation(): + _iast_patched_module("urllib.parse") + mod = _iast_patched_module("benchmarks.bm.iast_fixtures.str_methods") + + text = "http://localhost:8000/api/articles?param1=value1" + string_input = taint_pyobject( + pyobject=text, source_name="first_element", source_value=text, source_origin=OriginType.PARAMETER + ) + + result = mod.urlib_urlsplit(string_input) + assert result.path == "/api/articles" + assert result.scheme == "http" + assert result.netloc == "localhost:8000" + + assert get_tainted_ranges(result.path) == [TaintRange(0, 13, Source("first_element", text, OriginType.PARAMETER))] + if sys.version_info > (3, 9): + assert get_tainted_ranges(result.scheme) == [ + TaintRange(0, 4, Source("first_element", text, OriginType.PARAMETER)) + ] + assert get_tainted_ranges(result.netloc) == [TaintRange(0, 14, Source("first_element", text, OriginType.PARAMETER))] diff --git a/tests/contrib/django/test_django_appsec_iast.py b/tests/contrib/django/test_django_appsec_iast.py index 35f195d1d45..7f07dab17bb 100644 --- a/tests/contrib/django/test_django_appsec_iast.py +++ b/tests/contrib/django/test_django_appsec_iast.py @@ -470,7 +470,7 @@ def test_django_tainted_user_agent_iast_enabled_sqli_http_body(client, test_span line, hash_value = get_line_and_hash("iast_enabled_sqli_http_body", VULN_SQL_INJECTION, filename=TEST_FILE) - assert loaded["sources"] == [{"origin": "http.request.body", "name": "body", "value": "master"}] + assert loaded["sources"] == [{"origin": "http.request.body", "name": "http.request.body", "value": "master"}] assert loaded["vulnerabilities"][0]["type"] == VULN_SQL_INJECTION assert loaded["vulnerabilities"][0]["hash"] == hash_value assert loaded["vulnerabilities"][0]["evidence"] == { @@ -547,7 +547,7 @@ def test_django_command_injection(client, test_spans, tracer): line, hash_value = get_line_and_hash("iast_command_injection", VULN_CMDI, filename=TEST_FILE) assert loaded["sources"] == [ - {"name": "body", "origin": "http.request.body", "pattern": "abcdef", "redacted": True} + {"name": "http.request.body", "origin": "http.request.body", "pattern": "abcdef", "redacted": True} ] assert loaded["vulnerabilities"][0]["type"] == VULN_CMDI assert loaded["vulnerabilities"][0]["hash"] == hash_value @@ -576,7 +576,7 @@ def test_django_header_injection(client, test_spans, tracer): line, hash_value = get_line_and_hash("iast_header_injection", VULN_HEADER_INJECTION, filename=TEST_FILE) - assert loaded["sources"] == [{"origin": "http.request.body", "name": "body", "value": "master"}] + assert loaded["sources"] == [{"origin": "http.request.body", "name": "http.request.body", "value": "master"}] assert loaded["vulnerabilities"][0]["type"] == VULN_HEADER_INJECTION assert loaded["vulnerabilities"][0]["hash"] == hash_value assert loaded["vulnerabilities"][0]["evidence"] == { diff --git a/tests/contrib/flask/test_flask_appsec_iast.py b/tests/contrib/flask/test_flask_appsec_iast.py index 4835947b079..f8db3ffc051 100644 --- a/tests/contrib/flask/test_flask_appsec_iast.py +++ b/tests/contrib/flask/test_flask_appsec_iast.py @@ -553,7 +553,7 @@ def sqli_10(): json_data = request.json else: json_data = json.loads(request.data) - value = json_data.get("body") + value = json_data.get("json_body") assert value == "master" assert is_pyobject_tainted(value) query = add_aspect(add_aspect("SELECT tbl_name FROM sqlite_", value), " WHERE tbl_name LIKE 'password'") @@ -569,7 +569,7 @@ def sqli_10(): ) ): resp = self.client.post( - "/sqli/body/", data=json.dumps(dict(body="master")), content_type="application/json" + "/sqli/body/", data=json.dumps(dict(json_body="master")), content_type="application/json" ) assert resp.status_code == 200 @@ -577,7 +577,7 @@ def sqli_10(): assert root_span.get_metric(IAST.ENABLED) == 1.0 loaded = json.loads(root_span.get_tag(IAST.JSON)) - assert loaded["sources"] == [{"name": "body", "origin": "http.request.body", "value": "master"}] + assert loaded["sources"] == [{"name": "json_body", "origin": "http.request.body", "value": "master"}] line, hash_value = get_line_and_hash( "test_flask_request_body", From 7adb4e6b3da452409f73a39fd873178b736c8033 Mon Sep 17 00:00:00 2001 From: Alberto Vara Date: Thu, 5 Sep 2024 12:10:16 +0200 Subject: [PATCH 11/12] chore(iast): report instrumented metric for flask (#10505) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [ ] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/appsec/_handlers.py | 3 +++ tests/appsec/integrations/test_flask_telemetry.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/ddtrace/appsec/_handlers.py b/ddtrace/appsec/_handlers.py index 8b43e4ae855..dcd2db2844d 100644 --- a/ddtrace/appsec/_handlers.py +++ b/ddtrace/appsec/_handlers.py @@ -253,6 +253,9 @@ def _on_flask_patch(flask_version): _set_metric_iast_instrumented_source(OriginType.PATH) _set_metric_iast_instrumented_source(OriginType.QUERY) + # Instrumented on _ddtrace.appsec._asm_request_context._on_wrapped_view + _set_metric_iast_instrumented_source(OriginType.PATH_PARAMETER) + try_wrap_function_wrapper( "werkzeug.wrappers.request", "Request.get_data", diff --git a/tests/appsec/integrations/test_flask_telemetry.py b/tests/appsec/integrations/test_flask_telemetry.py index 0539aaf6618..fac88e443d7 100644 --- a/tests/appsec/integrations/test_flask_telemetry.py +++ b/tests/appsec/integrations/test_flask_telemetry.py @@ -24,11 +24,12 @@ def test_flask_instrumented_metrics(telemetry_writer): metrics_result = telemetry_writer._namespace._metrics_data metrics_source_tags_result = [metric._tags[0][1] for metric in metrics_result["generate-metrics"]["iast"].values()] - assert len(metrics_source_tags_result) == 6 + assert len(metrics_source_tags_result) == 7 assert origin_to_str(OriginType.HEADER_NAME) in metrics_source_tags_result assert origin_to_str(OriginType.HEADER) in metrics_source_tags_result assert origin_to_str(OriginType.PARAMETER) in metrics_source_tags_result assert origin_to_str(OriginType.PATH) in metrics_source_tags_result + assert origin_to_str(OriginType.PATH_PARAMETER) in metrics_source_tags_result assert origin_to_str(OriginType.QUERY) in metrics_source_tags_result assert origin_to_str(OriginType.BODY) in metrics_source_tags_result From 9acdd8d0e673438b3dc806ee72a257e9958942ba Mon Sep 17 00:00:00 2001 From: William Conti <58711692+wconti27@users.noreply.github.com> Date: Thu, 5 Sep 2024 10:39:14 -0400 Subject: [PATCH 12/12] feat(dsm): implement datastreams monitoring schema support (#10451) ## Checklist - [x] PR author has checked that all the criteria below are met - The PR description includes an overview of the change - The PR description articulates the motivation for the change - The change includes tests OR the PR description describes a testing strategy - The PR description notes risks associated with the change, if any - Newly-added code is easy to change - The change follows the [library release note guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html) - The change includes or references documentation updates if necessary - Backport labels are set (if [applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)) ## Reviewer Checklist - [x] Reviewer has checked that all the criteria below are met - Title is accurate - All changes are related to the pull request's stated goal - Avoids breaking [API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces) changes - Testing strategy adequately addresses listed risks - Newly-added code is easy to change - Release note makes sense to a user of the library - If necessary, author has acknowledged and discussed the performance implications of this PR as reported in the benchmarks PR comment - Backport labels are set in a manner that is consistent with the [release branch maintenance policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting) --- ddtrace/ext/schema.py | 7 ++ ddtrace/internal/datastreams/processor.py | 18 ++++ .../internal/datastreams/schemas/__init__.py | 0 .../internal/datastreams/schemas/schema.py | 4 + .../datastreams/schemas/schema_builder.py | 101 ++++++++++++++++++ .../datastreams/schemas/schema_iterator.py | 7 ++ .../datastreams/schemas/schema_sampler.py | 25 +++++ ...ring-schema-tracking-784ba3f9f3c368d2.yaml | 4 + tests/.suitespec.json | 3 +- tests/datastreams/schemas/__init__.py | 0 .../schemas/test_schema_builder.py | 52 +++++++++ .../schemas/test_schema_sampler.py | 32 ++++++ 12 files changed, 252 insertions(+), 1 deletion(-) create mode 100644 ddtrace/ext/schema.py create mode 100644 ddtrace/internal/datastreams/schemas/__init__.py create mode 100644 ddtrace/internal/datastreams/schemas/schema.py create mode 100644 ddtrace/internal/datastreams/schemas/schema_builder.py create mode 100644 ddtrace/internal/datastreams/schemas/schema_iterator.py create mode 100644 ddtrace/internal/datastreams/schemas/schema_sampler.py create mode 100644 releasenotes/notes/add-datastreams-monitoring-schema-tracking-784ba3f9f3c368d2.yaml create mode 100644 tests/datastreams/schemas/__init__.py create mode 100644 tests/datastreams/schemas/test_schema_builder.py create mode 100644 tests/datastreams/schemas/test_schema_sampler.py diff --git a/ddtrace/ext/schema.py b/ddtrace/ext/schema.py new file mode 100644 index 00000000000..6ce4dd17b71 --- /dev/null +++ b/ddtrace/ext/schema.py @@ -0,0 +1,7 @@ +SCHEMA_DEFINITION = "schema.definition" +SCHEMA_WEIGHT = "schema.weight" +SCHEMA_TYPE = "schema.type" +SCHEMA_ID = "schema.id" +SCHEMA_TOPIC = "schema.topic" +SCHEMA_OPERATION = "schema.operation" +SCHEMA_NAME = "schema.name" diff --git a/ddtrace/internal/datastreams/processor.py b/ddtrace/internal/datastreams/processor.py index c3424976806..5afdb07c9a2 100644 --- a/ddtrace/internal/datastreams/processor.py +++ b/ddtrace/internal/datastreams/processor.py @@ -34,6 +34,8 @@ from .encoding import decode_var_int_64 from .encoding import encode_var_int_64 from .fnv import fnv1_64 +from .schemas.schema_builder import SchemaBuilder +from .schemas.schema_sampler import SchemaSampler def gzip_compress(payload): @@ -141,6 +143,7 @@ def __init__(self, agent_url, interval=None, timeout=1.0, retry_attempts=3): self._lock = Lock() self._current_context = threading.local() self._enabled = True + self._schema_samplers: Dict[str, SchemaSampler] = {} self._flush_stats_with_backoff = fibonacci_backoff_with_jitter( attempts=retry_attempts, @@ -383,6 +386,21 @@ def set_checkpoint(self, tags, now_sec=None, payload_size=0, span=None): ctx.set_checkpoint(tags, now_sec=now_sec, payload_size=payload_size, span=span) return ctx + def try_sample_schema(self, topic): + now_ms = time.time() * 1000 + + sampler = self._schema_samplers.setdefault(topic, SchemaSampler()) + return sampler.try_sample(now_ms) + + def can_sample_schema(self, topic): + now_ms = time.time() * 1000 + + sampler = self._schema_samplers.setdefault(topic, SchemaSampler()) + return sampler.can_sample(now_ms) + + def get_schema(self, schema_name, iterator): + return SchemaBuilder.get_schema(schema_name, iterator) + class DataStreamsCtx: def __init__(self, processor, hash_value, pathway_start_sec, current_edge_start_sec): diff --git a/ddtrace/internal/datastreams/schemas/__init__.py b/ddtrace/internal/datastreams/schemas/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/ddtrace/internal/datastreams/schemas/schema.py b/ddtrace/internal/datastreams/schemas/schema.py new file mode 100644 index 00000000000..bcfbb436f8c --- /dev/null +++ b/ddtrace/internal/datastreams/schemas/schema.py @@ -0,0 +1,4 @@ +class Schema: + def __init__(self, definition, _id): + self.definition = definition + self.id = _id diff --git a/ddtrace/internal/datastreams/schemas/schema_builder.py b/ddtrace/internal/datastreams/schemas/schema_builder.py new file mode 100644 index 00000000000..a21aae3b367 --- /dev/null +++ b/ddtrace/internal/datastreams/schemas/schema_builder.py @@ -0,0 +1,101 @@ +from dataclasses import asdict +from dataclasses import dataclass +from dataclasses import field +from functools import lru_cache +import json +from typing import Any +from typing import Dict +from typing import List +from typing import Optional + +from ..fnv import fnv1_64 +from .schema import Schema +from .schema_iterator import SchemaIterator + + +class SchemaBuilder: + max_depth = 10 + max_properties = 1000 + CACHE = lru_cache(maxsize=32) + properties = 0 + + def __init__(self, iterator): + self.schema: OpenApiSchema = OpenApiSchema() + self.iterator: SchemaIterator = iterator + + def add_property(self, schema_name, field_name, is_array, type_, description, ref, format_, enum_values): + if self.properties >= self.max_properties: + return False + self.properties += 1 + _property = OpenApiSchema.Property(type_, description, ref, format_, enum_values, None) + if is_array: + _property = OpenApiSchema.Property("array", None, None, None, None, _property) + self.schema.components.schemas[schema_name].properties[field_name] = _property + return True + + def build(self): + self.iterator.iterate_over_schema(self) + no_nones = convert_to_json_compatible(self.schema) + definition = json.dumps(no_nones, default=lambda o: o.__dict__) + _id = str(fnv1_64(definition.encode("utf-8"))) + return Schema(definition, _id) + + def should_extract_schema(self, schema_name, depth): + if depth > self.max_depth: + return False + if schema_name in self.schema.components.schemas: + return False + self.schema.components.schemas[schema_name] = OpenApiSchema.Schema() + return True + + @staticmethod + def get_schema(schema_name, iterator): + if schema_name not in SchemaBuilder.CACHE: + SchemaBuilder.CACHE[schema_name] = SchemaBuilder(iterator).build() + return SchemaBuilder.CACHE[schema_name] + + +@dataclass +class OpenApiSchema: + openapi: str = "3.0.0" + components: "OpenApiSchema.Components" = field(default_factory=lambda: OpenApiSchema.Components()) + + @dataclass + class Property: + type: str + description: Optional[str] = None + ref: Optional[str] = field(default=None, metadata={"name": "$ref"}) + format: Optional[str] = None + enum_values: Optional[List[str]] = field(default=None, metadata={"name": "enum"}) + items: Optional["OpenApiSchema.Property"] = None + + @dataclass + class Schema: + type: str = "object" + properties: Dict[str, "OpenApiSchema.Property"] = field(default_factory=dict) + + @dataclass + class Components: + schemas: Dict[str, "OpenApiSchema.Schema"] = field(default_factory=dict) + + +def convert_to_json_compatible(obj: Any) -> Any: + if isinstance(obj, list): + return [convert_to_json_compatible(item) for item in obj if item is not None] + elif isinstance(obj, dict): + return {convert_key(k): convert_to_json_compatible(v) for k, v in obj.items() if v is not None} + elif hasattr(obj, "__dataclass_fields__"): + return {convert_key(k): convert_to_json_compatible(v) for k, v in asdict(obj).items() if v is not None} + return obj + + +def convert_key(key: str) -> str: + if key == "ref": + return "$ref" + elif key == "enum_values": + return "enum" + elif key == "_property": + return "property" + elif key == "_id": + return "id" + return key diff --git a/ddtrace/internal/datastreams/schemas/schema_iterator.py b/ddtrace/internal/datastreams/schemas/schema_iterator.py new file mode 100644 index 00000000000..c83a1372f22 --- /dev/null +++ b/ddtrace/internal/datastreams/schemas/schema_iterator.py @@ -0,0 +1,7 @@ +import abc + + +class SchemaIterator: + @abc.abstractmethod + def iterate_over_schema(self, builder): + pass diff --git a/ddtrace/internal/datastreams/schemas/schema_sampler.py b/ddtrace/internal/datastreams/schemas/schema_sampler.py new file mode 100644 index 00000000000..b9ff0a613b8 --- /dev/null +++ b/ddtrace/internal/datastreams/schemas/schema_sampler.py @@ -0,0 +1,25 @@ +import threading + + +class SchemaSampler: + SAMPLE_INTERVAL_MILLIS = 30 * 1000 + + def __init__(self): + self.weight = 0 + self.last_sample_millis = 0 + self.lock = threading.Lock() + + def try_sample(self, current_time_millis): + if current_time_millis >= self.last_sample_millis + self.SAMPLE_INTERVAL_MILLIS: + with self.lock: + if current_time_millis >= self.last_sample_millis + self.SAMPLE_INTERVAL_MILLIS: + self.last_sample_millis = current_time_millis + weight = self.weight + self.weight = 0 + return weight + return 0 + + def can_sample(self, current_time_millis): + with self.lock: + self.weight += 1 + return current_time_millis >= self.last_sample_millis + self.SAMPLE_INTERVAL_MILLIS diff --git a/releasenotes/notes/add-datastreams-monitoring-schema-tracking-784ba3f9f3c368d2.yaml b/releasenotes/notes/add-datastreams-monitoring-schema-tracking-784ba3f9f3c368d2.yaml new file mode 100644 index 00000000000..733b759ebd9 --- /dev/null +++ b/releasenotes/notes/add-datastreams-monitoring-schema-tracking-784ba3f9f3c368d2.yaml @@ -0,0 +1,4 @@ +--- +features: + - | + Datastreams Monitoring (DSM): Add support for schema tracking. diff --git a/tests/.suitespec.json b/tests/.suitespec.json index 66a0bbc76de..ab0c087bd37 100644 --- a/tests/.suitespec.json +++ b/tests/.suitespec.json @@ -175,7 +175,8 @@ ], "datastreams": [ "ddtrace/internal/datastreams/*", - "ddtrace/data_streams.py" + "ddtrace/data_streams.py", + "ddtrace/ext/schema.py" ], "debugging": [ "ddtrace/debugging/*", diff --git a/tests/datastreams/schemas/__init__.py b/tests/datastreams/schemas/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/datastreams/schemas/test_schema_builder.py b/tests/datastreams/schemas/test_schema_builder.py new file mode 100644 index 00000000000..4e81b7ef791 --- /dev/null +++ b/tests/datastreams/schemas/test_schema_builder.py @@ -0,0 +1,52 @@ +import json + +from ddtrace.internal.datastreams.schemas.schema_builder import SchemaBuilder +from ddtrace.internal.datastreams.schemas.schema_iterator import SchemaIterator + + +class Iterator(SchemaIterator): + def iterate_over_schema(self, builder: SchemaBuilder): + builder.add_property("person", "name", False, "string", "name of the person", None, None, None) + builder.add_property("person", "phone_numbers", True, "string", None, None, None, None) + builder.add_property("person", "person_name", False, "string", None, None, None, None) + builder.add_property("person", "address", False, "object", None, "#/components/schemas/address", None, None) + builder.add_property("address", "zip", False, "number", None, None, "int", None) + builder.add_property("address", "street", False, "string", None, None, None, None) + + +def test_schema_is_converted_correctly_to_json(): + builder = SchemaBuilder(Iterator()) + + should_extract_person = builder.should_extract_schema("person", 0) + should_extract_address = builder.should_extract_schema("address", 1) + should_extract_person2 = builder.should_extract_schema("person", 0) + should_extract_too_deep = builder.should_extract_schema("city", 11) + schema = builder.build() + + expected_schema = { + "components": { + "schemas": { + "person": { + "properties": { + "name": {"description": "name of the person", "type": "string"}, + "phone_numbers": {"items": {"type": "string"}, "type": "array"}, + "person_name": {"type": "string"}, + "address": {"$ref": "#/components/schemas/address", "type": "object"}, + }, + "type": "object", + }, + "address": { + "properties": {"zip": {"format": "int", "type": "number"}, "street": {"type": "string"}}, + "type": "object", + }, + } + }, + "openapi": "3.0.0", + } + + assert json.loads(schema.definition) == expected_schema + assert schema.id == "9510078321201428652" + assert should_extract_person + assert should_extract_address + assert not should_extract_person2 + assert not should_extract_too_deep diff --git a/tests/datastreams/schemas/test_schema_sampler.py b/tests/datastreams/schemas/test_schema_sampler.py new file mode 100644 index 00000000000..b4075284232 --- /dev/null +++ b/tests/datastreams/schemas/test_schema_sampler.py @@ -0,0 +1,32 @@ +from ddtrace.internal.datastreams.schemas.schema_sampler import SchemaSampler + + +def test_schema_sampler_samples_with_correct_weights(): + currentTimeMillis = 100000 + sampler = SchemaSampler() + + can_sample1 = sampler.can_sample(currentTimeMillis) + weight1 = sampler.try_sample(currentTimeMillis) + + can_sample2 = sampler.can_sample(currentTimeMillis + 1000) + weight2 = sampler.try_sample(currentTimeMillis + 1000) + + can_sample3 = sampler.can_sample(currentTimeMillis + 2000) + weight3 = sampler.try_sample(currentTimeMillis + 2000) + + can_sample4 = sampler.can_sample(currentTimeMillis + 30000) + weight4 = sampler.try_sample(currentTimeMillis + 30000) + + can_sample5 = sampler.can_sample(currentTimeMillis + 30001) + weight5 = sampler.try_sample(currentTimeMillis + 30001) + + assert can_sample1 + assert weight1 == 1 + assert not can_sample2 + assert weight2 == 0 + assert not can_sample3 + assert weight3 == 0 + assert can_sample4 + assert weight4 == 3 + assert not can_sample5 + assert weight5 == 0