Skip to content

Commit

Permalink
chore(internal): start crashtracker when enabled (#9865)
Browse files Browse the repository at this point in the history
WIP: initial implementation; still needs some manual and automated
testing changes.

## Checklist
- [x] PR author has checked that all the criteria below are met
- The PR description includes an overview of the change
- The PR description articulates the motivation for the change
- The change includes tests OR the PR description describes a testing
strategy
- The PR description notes risks associated with the change, if any
- Newly-added code is easy to change
- The change follows the [library release note
guidelines](https://ddtrace.readthedocs.io/en/stable/releasenotes.html)
- The change includes or references documentation updates if necessary
- Backport labels are set (if
[applicable](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting))

## Reviewer Checklist
- [x] Reviewer has checked that all the criteria below are met 
- Title is accurate
- All changes are related to the pull request's stated goal
- Avoids breaking
[API](https://ddtrace.readthedocs.io/en/stable/versioning.html#interfaces)
changes
- Testing strategy adequately addresses listed risks
- Newly-added code is easy to change
- Release note makes sense to a user of the library
- If necessary, author has acknowledged and discussed the performance
implications of this PR as reported in the benchmarks PR comment
- Backport labels are set in a manner that is consistent with the
[release branch maintenance
policy](https://ddtrace.readthedocs.io/en/latest/contributing.html#backporting)

---------

Signed-off-by: Juanjo Alvarez <[email protected]>
Co-authored-by: erikayasuda <[email protected]>
Co-authored-by: David Sanchez <[email protected]>
Co-authored-by: Taegyun Kim <[email protected]>
Co-authored-by: Federico Mon <[email protected]>
Co-authored-by: Christophe Papazian <[email protected]>
Co-authored-by: Juanjo Alvarez Martinez <[email protected]>
  • Loading branch information
7 people authored Jul 19, 2024
1 parent 6c3db8e commit dc000ae
Show file tree
Hide file tree
Showing 10 changed files with 265 additions and 22 deletions.
12 changes: 12 additions & 0 deletions ddtrace/bootstrap/preload.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
Bootstrapping code that is run when using the `ddtrace-run` Python entrypoint
Add all monkey-patching that needs to run by default here
"""

import os # noqa:I001

from ddtrace import config # noqa:F401
Expand All @@ -15,6 +16,7 @@
from ddtrace.internal.utils.formats import asbool # noqa:F401
from ddtrace.internal.utils.formats import parse_tags_str # noqa:F401
from ddtrace.settings.asm import config as asm_config # noqa:F401
from ddtrace.settings.crashtracker import config as crashtracker_config
from ddtrace.settings.symbol_db import config as symdb_config # noqa:F401
from ddtrace import tracer

Expand All @@ -41,6 +43,16 @@ def register_post_preload(func: t.Callable) -> None:

log = get_logger(__name__)

# DEV: We want to start the crashtracker as early as possible
if crashtracker_config.enabled:
    log.debug("crashtracking enabled via environment variable")
    try:
        # Imported lazily inside the try so that an import failure is logged
        # instead of aborting the rest of the preload.
        from ddtrace.internal.core import crashtracking

        # start() returns a boolean; surface an unsuccessful start in the debug
        # log rather than silently discarding the result.
        if not crashtracking.start():
            log.debug("crashtracking did not start")
    except Exception:
        log.error("failed to enable crashtracking", exc_info=True)


if profiling_config.enabled:
log.debug("profiler enabled via environment variable")
Expand Down
50 changes: 50 additions & 0 deletions ddtrace/internal/core/crashtracking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from typing import Callable

from ddtrace import config
from ddtrace import version
from ddtrace.internal import agent
from ddtrace.internal.datadog.profiling import crashtracker
from ddtrace.internal.runtime import get_runtime_id
from ddtrace.internal.runtime import on_runtime_id_change
from ddtrace.settings.crashtracker import config as crashtracker_config


# Re-export the native crashtracker module's state so callers can query it
# through this module. `is_available` and `failure_msg` are read once, when this
# module is imported; `is_started` is a callable that is evaluated on demand.
is_available: bool = crashtracker.is_available
failure_msg: str = crashtracker.failure_msg
is_started: Callable[[], bool] = crashtracker.is_started


@on_runtime_id_change
def _update_runtime_id(runtime_id: str) -> None:
    """Forward a changed runtime ID to the native crashtracker."""
    crashtracker.set_runtime_id(runtime_id)


def start() -> bool:
    """Configure the native crashtracker from ddtrace settings and start it.

    Returns:
        ``False`` when the crashtracker is not available or not enabled;
        otherwise the result of ``crashtracker.start()``.
    """
    if not is_available:
        return False

    # A configured debug URL takes precedence over the trace agent URL.
    crashtracker.set_url(crashtracker_config.debug_url or agent.get_trace_url())
    crashtracker.set_service(config.service)
    crashtracker.set_version(config.version)
    crashtracker.set_env(config.env)
    crashtracker.set_runtime_id(get_runtime_id())
    crashtracker.set_library_version(version.get_version())
    crashtracker.set_alt_stack(bool(crashtracker_config.alt_stack))

    # Map the configured resolver to its setter; any other value (including
    # None) disables frame resolution.
    resolver_setters = {
        "fast": "set_resolve_frames_fast",
        "full": "set_resolve_frames_full",
        "safe": "set_resolve_frames_safe",
    }
    setter_name = resolver_setters.get(
        crashtracker_config.stacktrace_resolver,
        "set_resolve_frames_disable",
    )
    getattr(crashtracker, setter_name)()

    # Output redirection is only configured when a filename was provided.
    stdout_filename = crashtracker_config.stdout_filename
    if stdout_filename:
        crashtracker.set_stdout_filename(stdout_filename)
    stderr_filename = crashtracker_config.stderr_filename
    if stderr_filename:
        crashtracker.set_stderr_filename(stderr_filename)

    # Everything above only configures; starting is gated on the enabled flag.
    if not crashtracker_config.enabled:
        return False
    return crashtracker.start()
13 changes: 13 additions & 0 deletions ddtrace/internal/datadog/profiling/crashtracker/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,23 @@
failure_msg = ""


def _default_return_false(*_args, **_kwargs):
    """Stub callable: accept any arguments, ignore them, and return ``False``."""
    return False


try:
    from ._crashtracker import *  # noqa: F403, F401

    is_available = True

except Exception as e:
    failure_msg = str(e)

    # Crashtracker is used early during startup, and so it must be robust across installations.
    # Here we just stub everything.
    def __getattr__(name):
        """Resolve names missing from this module when the native extension failed to load.

        Any regular name resolves to a stub callable that returns ``False``, so
        callers can use the crashtracker API without checking availability first.

        :param name: Attribute name being looked up on the module.
        :raises AttributeError: For dunder names, which must stay missing.
        """
        # Dunder lookups come from the import machinery and introspection tools
        # (e.g. ``__all__`` for star-imports, ``__wrapped__`` for unwrapping).
        # Answering them with the stub would make them misbehave.
        if name.startswith("__") and name.endswith("__"):
            raise AttributeError("module %r has no attribute %r" % (__name__, name))
        if name == "failure_msg":
            return failure_msg
        if name == "is_available":
            return False
        return _default_return_false
16 changes: 14 additions & 2 deletions ddtrace/internal/runtime/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,22 @@
]


def _generate_runtime_id():
def _generate_runtime_id() -> str:
return uuid.uuid4().hex


_RUNTIME_ID = _generate_runtime_id()
_RUNTIME_ID: str = _generate_runtime_id()
_ANCESTOR_RUNTIME_ID: t.Optional[str] = None
_ON_RUNTIME_ID_CHANGE: t.Set[t.Callable[[str], None]] = set()


def on_runtime_id_change(cb: t.Callable[[str], None]) -> t.Callable[[str], None]:
    """Register a callback to be called when the runtime ID changes.

    This can happen after a fork.

    The callback is returned unchanged so this function can be used as a
    decorator without rebinding the decorated name to ``None``.

    :param cb: Callable invoked with the new runtime ID.
    :returns: ``cb`` itself.
    """
    # The registry set is only mutated, never rebound, so no ``global`` is needed.
    _ON_RUNTIME_ID_CHANGE.add(cb)
    return cb


@forksafe.register
Expand All @@ -26,6 +36,8 @@ def _set_runtime_id():
_ANCESTOR_RUNTIME_ID = _RUNTIME_ID

_RUNTIME_ID = _generate_runtime_id()
for cb in _ON_RUNTIME_ID_CHANGE:
cb(_RUNTIME_ID)


def get_runtime_id():
Expand Down
11 changes: 11 additions & 0 deletions ddtrace/internal/telemetry/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,3 +70,14 @@
TELEMETRY_INJECT_WAS_ATTEMPTED = "DD_LIB_INJECTION_ATTEMPTED"
TELEMETRY_LIB_WAS_INJECTED = "DD_LIB_INJECTED"
TELEMETRY_LIB_INJECTION_FORCED = "DD_INJECT_FORCE"


# Crashtracker
# Names under which the crashtracker state and settings are reported in the
# telemetry configuration list.
TELEMETRY_CRASHTRACKING_ENABLED = "crashtracking_enabled"  # Env var enabled
TELEMETRY_CRASHTRACKING_AVAILABLE = "crashtracking_available"  # Feature is available
TELEMETRY_CRASHTRACKING_STARTED = "crashtracking_started"  # Crashtracking is running
TELEMETRY_CRASHTRACKING_STDOUT_FILENAME = "crashtracking_stdout_filename"  # Configured stdout filename
TELEMETRY_CRASHTRACKING_STDERR_FILENAME = "crashtracking_stderr_filename"  # Configured stderr filename
TELEMETRY_CRASHTRACKING_ALT_STACK = "crashtracking_alt_stack"  # Alt-stack setting
TELEMETRY_CRASHTRACKING_STACKTRACE_RESOLVER = "crashtracking_stacktrace_resolver"  # Resolver mode, if any
TELEMETRY_CRASHTRACKING_DEBUG_URL = "crashtracking_debug_url"  # Debug URL override, if any
19 changes: 19 additions & 0 deletions ddtrace/internal/telemetry/writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,14 @@
from ...internal import atexit
from ...internal import forksafe
from ...internal.compat import parse
from ...internal.core import crashtracking
from ...internal.module import BaseModuleWatchdog
from ...internal.module import origin
from ...internal.schema import SCHEMA_VERSION
from ...internal.schema import _remove_client_service_names
from ...settings import _config as config
from ...settings.config import _ConfigSource
from ...settings.crashtracker import config as crashtracker_config
from ...settings.dynamic_instrumentation import config as di_config
from ...settings.exception_debugging import config as ed_config
from ...settings.peer_service import _ps_config
Expand All @@ -47,6 +49,14 @@
from .constants import TELEMETRY_AGENT_URL
from .constants import TELEMETRY_ANALYTICS_ENABLED
from .constants import TELEMETRY_CLIENT_IP_ENABLED
from .constants import TELEMETRY_CRASHTRACKING_ALT_STACK
from .constants import TELEMETRY_CRASHTRACKING_AVAILABLE
from .constants import TELEMETRY_CRASHTRACKING_DEBUG_URL
from .constants import TELEMETRY_CRASHTRACKING_ENABLED
from .constants import TELEMETRY_CRASHTRACKING_STACKTRACE_RESOLVER
from .constants import TELEMETRY_CRASHTRACKING_STARTED
from .constants import TELEMETRY_CRASHTRACKING_STDERR_FILENAME
from .constants import TELEMETRY_CRASHTRACKING_STDOUT_FILENAME
from .constants import TELEMETRY_DOGSTATSD_PORT
from .constants import TELEMETRY_DOGSTATSD_URL
from .constants import TELEMETRY_DYNAMIC_INSTRUMENTATION_ENABLED
Expand Down Expand Up @@ -510,6 +520,15 @@ def _app_started_event(self, register_app_shutdown=True):
(TELEMETRY_INJECT_WAS_ATTEMPTED, config._inject_was_attempted, "unknown"),
(TELEMETRY_LIB_WAS_INJECTED, config._lib_was_injected, "unknown"),
(TELEMETRY_LIB_INJECTION_FORCED, config._inject_force, "unknown"),
# Crashtracker
(TELEMETRY_CRASHTRACKING_ENABLED, crashtracker_config.enabled, "unknown"),
(TELEMETRY_CRASHTRACKING_STARTED, crashtracking.is_started(), "unknown"),
(TELEMETRY_CRASHTRACKING_AVAILABLE, crashtracking.is_available, "unknown"),
(TELEMETRY_CRASHTRACKING_STACKTRACE_RESOLVER, str(crashtracker_config.stacktrace_resolver), "unknown"),
(TELEMETRY_CRASHTRACKING_STDOUT_FILENAME, str(crashtracker_config.stdout_filename), "unknown"),
(TELEMETRY_CRASHTRACKING_STDERR_FILENAME, str(crashtracker_config.stderr_filename), "unknown"),
(TELEMETRY_CRASHTRACKING_DEBUG_URL, str(crashtracker_config.debug_url), "unknown"),
(TELEMETRY_CRASHTRACKING_ALT_STACK, crashtracker_config.alt_stack, "unknown"),
]
+ get_python_config_vars()
)
Expand Down
25 changes: 13 additions & 12 deletions ddtrace/settings/crashtracker.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,27 +2,25 @@

from envier import En

from ddtrace.internal.datadog.profiling import crashtracker


def _derive_stacktrace_resolver(config):
# type: (CrashtrackerConfig) -> t.Optional[str]
resolver = config._stacktrace_resolver or ""
def _derive_stacktrace_resolver(config: "CrashtrackerConfig") -> t.Optional[str]:
resolver = str(config._stacktrace_resolver or "")
resolver = resolver.lower()
if resolver in ("fast", "full"):
if resolver in ("fast", "full", "safe"):
return resolver
return None


def _check_for_crashtracker_available():
def _check_for_crashtracker_available() -> bool:
from ddtrace.internal.datadog.profiling import crashtracker

return crashtracker.is_available


def _derive_crashtracker_enabled(config):
# type: (CrashtrackerConfig) -> bool
def _derive_crashtracker_enabled(config: "CrashtrackerConfig") -> bool:
if not _check_for_crashtracker_available():
return False
return config._enabled
return bool(config._enabled)


class CrashtrackerConfig(En):
Expand All @@ -31,7 +29,7 @@ class CrashtrackerConfig(En):
_enabled = En.v(
bool,
"enabled",
default=False,
default=True,
help_type="Boolean",
help="Enables the crashtracker",
)
Expand Down Expand Up @@ -77,6 +75,9 @@ class CrashtrackerConfig(En):
default=None,
help_type="String",
help="How to collect native stack traces during a crash, if at all. Accepted values are 'none', 'fast',"
" and 'full'. The default value is 'none' (no stack traces).",
" 'safe', and 'full'. The default value is 'none' (no stack traces).",
)
stacktrace_resolver = En.d(t.Optional[str], _derive_stacktrace_resolver)


config = CrashtrackerConfig()
108 changes: 108 additions & 0 deletions tests/internal/crashtracker/test_crashtracker.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import os
import sys

import pytest

import tests.internal.crashtracker.utils as utils


@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="Linux only")
@pytest.mark.subprocess()
Expand Down Expand Up @@ -274,3 +277,108 @@ def test_crashtracker_raise_sigbus():
data = utils.conn_to_bytes(conn)
conn.close()
assert b"os_kill" in data


# Program run in a child process by the preload tests. Reading a C string from
# address 0 is meant to crash the interpreter (the tests expect SIGSEGV), so
# `exit(-1)` is only reached if that crash does not happen.
preload_code = """
import ctypes
ctypes.string_at(0)
exit(-1)
"""


@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="Linux only")
def test_crashtracker_preload_default(ddtrace_run_python_code_in_subprocess):
    """With default settings, a crashing preloaded process sends data to the receiver."""
    # Setup the listening socket before we open ddtrace
    port, sock = utils.crashtracker_receiver_bind()
    assert sock

    # Call the program
    env = os.environ.copy()
    env["DD_TRACE_AGENT_URL"] = "http://localhost:%d" % port
    # This test covers the default: do not inherit an override from the
    # environment the test suite itself runs in.
    env.pop("DD_CRASHTRACKER_ENABLED", None)
    stdout, stderr, exitcode, _ = ddtrace_run_python_code_in_subprocess(preload_code, env=env)

    # Check for expected exit condition
    assert not stdout
    assert not stderr
    assert exitcode == -11  # exit code for SIGSEGV

    # Wait for the connection
    conn = utils.listen_get_conn(sock)
    assert conn
    try:
        data = utils.conn_to_bytes(conn)
    finally:
        # Release the accepted connection even if reading it fails.
        conn.close()
    assert data


@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="Linux only")
def test_crashtracker_preload_disabled(ddtrace_run_python_code_in_subprocess):
    """With the crashtracker disabled, a crashing preloaded process sends nothing."""
    # The receiver has to be bound before the child process is launched
    port, sock = utils.crashtracker_receiver_bind()
    assert sock

    # Child environment: point the agent URL at the receiver, disable the crashtracker
    child_env = dict(
        os.environ,
        DD_TRACE_AGENT_URL="http://localhost:%d" % port,
        DD_CRASHTRACKER_ENABLED="false",
    )
    out, err, status, _ = ddtrace_run_python_code_in_subprocess(preload_code, env=child_env)

    # The child prints nothing and still exits with -11
    assert not out
    assert not err
    assert status == -11

    # No connection is expected to arrive
    assert not utils.listen_get_conn(sock)


# Program run in a child process by the `ddtrace.auto` tests. It imports
# `ddtrace.auto` explicitly and then reads a C string from address 0, which is
# meant to crash the interpreter; `exit(-1)` is only reached if it does not.
auto_code = """
import ctypes
import ddtrace.auto
ctypes.string_at(0)
exit(-1)
"""


@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="Linux only")
def test_crashtracker_auto_default(run_python_code_in_subprocess):
    """With default settings, a crashing process that imported ``ddtrace.auto`` sends data."""
    # Setup the listening socket before we open ddtrace
    port, sock = utils.crashtracker_receiver_bind()
    assert sock

    # Call the program
    env = os.environ.copy()
    env["DD_TRACE_AGENT_URL"] = "http://localhost:%d" % port
    # This test covers the default: do not inherit an override from the
    # environment the test suite itself runs in.
    env.pop("DD_CRASHTRACKER_ENABLED", None)
    stdout, stderr, exitcode, _ = run_python_code_in_subprocess(auto_code, env=env)

    # Check for expected exit condition
    assert not stdout
    assert not stderr
    assert exitcode == -11  # exit code for SIGSEGV

    # Wait for the connection
    conn = utils.listen_get_conn(sock)
    assert conn
    try:
        data = utils.conn_to_bytes(conn)
    finally:
        # Release the accepted connection even if reading it fails.
        conn.close()
    assert data


@pytest.mark.skipif(not sys.platform.startswith("linux"), reason="Linux only")
def test_crashtracker_auto_disabled(run_python_code_in_subprocess):
    """With the crashtracker disabled, a crashing ``ddtrace.auto`` process sends nothing."""
    # The receiver has to be bound before the child process is launched
    port, sock = utils.crashtracker_receiver_bind()
    assert sock

    # Child environment: point the agent URL at the receiver, disable the crashtracker
    child_env = dict(
        os.environ,
        DD_TRACE_AGENT_URL="http://localhost:%d" % port,
        DD_CRASHTRACKER_ENABLED="false",
    )
    out, err, status, _ = run_python_code_in_subprocess(auto_code, env=child_env)

    # The child prints nothing and still exits with -11
    assert not out
    assert not err
    assert status == -11

    # No connection is expected to arrive
    assert not utils.listen_get_conn(sock)
Loading

0 comments on commit dc000ae

Please sign in to comment.