Skip to content

Commit

Permalink
feat: Log all span-starts for the first minute (#762)
Browse files Browse the repository at this point in the history
Datadog Support has asked us to log all span-starts. I'm hoping that they
can get what they need if we just log the first minute of spans, since
the problem seems to be related to some of the first requests after deploy
or restart.

Excluding function-docstring linting for the span processor, since those
are essentially overrides. (pylint only started caring once one of the
methods was more than 4 lines long.)
  • Loading branch information
timmc-edx authored Aug 9, 2024
1 parent 36bc403 commit 1345db5
Show file tree
Hide file tree
Showing 4 changed files with 54 additions and 3 deletions.
6 changes: 6 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ Change Log
Unreleased
~~~~~~~~~~

[4.1.0] - 2024-08-09
~~~~~~~~~~~~~~~~~~~~
Changed
-------
* Datadog diagnostics will now log all span-starts for the first minute after server startup

[4.0.0] - 2024-08-05
~~~~~~~~~~~~~~~~~~~~
Changed
Expand Down
2 changes: 1 addition & 1 deletion edx_arch_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,4 @@
A plugin to include applications under development by the architecture team at 2U.
"""

__version__ = '4.0.0'
__version__ = '4.1.0'
20 changes: 20 additions & 0 deletions edx_arch_experiments/datadog_diagnostics/apps.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
"""

import logging
import time

from django.apps import AppConfig
from django.conf import settings
Expand All @@ -26,20 +27,39 @@
# avoids logging more data than is actually needed for diagnosis.
DATADOG_DIAGNOSTICS_MAX_SPANS = getattr(settings, 'DATADOG_DIAGNOSTICS_MAX_SPANS', 100)

# .. setting_name: DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD
# .. setting_default: 60
# .. setting_description: Log all span starts for this many seconds after worker
# startup.
DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD = getattr(
settings,
'DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD',
60
)


# pylint: disable=missing-function-docstring
class MissingSpanProcessor:
"""Datadog span processor that logs unfinished spans at shutdown."""

def __init__(self):
self.spans_started = 0
self.spans_finished = 0
self.open_spans = {}
self.log_all_until = time.time() + DATADOG_DIAGNOSTICS_LOG_ALL_SPAN_STARTS_PERIOD

def on_span_start(self, span):
self.spans_started += 1
if len(self.open_spans) < DATADOG_DIAGNOSTICS_MAX_SPANS:
self.open_spans[span.span_id] = span

# We believe that the anomalous traces always come from a
# single span that is created early in the lifetime of a
# gunicorn worker. If we log *every* span-start in this early
# period, we may be able to observe something interesting.
if time.time() <= self.log_all_until:
log.info(f"Early span-start sample: {span._pprint()}") # pylint: disable=protected-access

def on_span_finish(self, span):
self.spans_finished += 1
self.open_spans.pop(span.span_id, None) # "delete if present"
Expand Down
29 changes: 27 additions & 2 deletions edx_arch_experiments/datadog_diagnostics/tests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Tests for plugin app.
"""

from unittest.mock import patch
from unittest.mock import call, patch

from django.test import TestCase

Expand Down Expand Up @@ -47,5 +47,30 @@ def test_logging(self, mock_log_error, mock_log_info):
proc.on_span_start(FakeSpan(17))
proc.shutdown(0)

mock_log_info.assert_called_once_with("Spans created = 1; spans finished = 0")
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=17"),
call("Spans created = 1; spans finished = 0"),
]
mock_log_error.assert_called_once_with("Span created but not finished: span_id=17")

@patch('edx_arch_experiments.datadog_diagnostics.apps.log.info')
def test_early_span_logging_cutoff(self, mock_log_info):
with patch('edx_arch_experiments.datadog_diagnostics.apps.time.time', side_effect=[
# Setup
1700000000,
# Span-start time checks
1700000020,
1700000040,
1700010000,
]):
proc = apps.MissingSpanProcessor()
# Three spans are started
proc.on_span_start(FakeSpan(44))
proc.on_span_start(FakeSpan(45))
proc.on_span_start(FakeSpan(46))

# Just two early span-starts are logged
assert mock_log_info.call_args_list == [
call("Early span-start sample: span_id=44"),
call("Early span-start sample: span_id=45"),
]

0 comments on commit 1345db5

Please sign in to comment.