fix(opentelemetry): resolve lazy sampling + distributed tracing bug [backport 2.10] #10029

Merged 2 commits on Aug 2, 2024
29 changes: 22 additions & 7 deletions ddtrace/opentelemetry/_span.py
@@ -10,6 +10,7 @@
 from opentelemetry.trace.span import TraceState

 from ddtrace import config
+from ddtrace import tracer as ddtracer
 from ddtrace.constants import ERROR_MSG
 from ddtrace.constants import ERROR_STACK
 from ddtrace.constants import ERROR_TYPE
@@ -136,13 +137,27 @@ def kind(self):
     def get_span_context(self):
         # type: () -> SpanContext
         """Returns an OpenTelemetry SpanContext"""
-        ts = None
-        tf = TraceFlags.DEFAULT
-        if self._ddspan.context:
-            ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id)
-            ts = TraceState.from_header([ts_str])
-            if self._ddspan.context.sampling_priority and self._ddspan.context.sampling_priority > 0:
-                tf = TraceFlags.SAMPLED
+        if self._ddspan.context.sampling_priority is None:
+            # With the introduction of lazy sampling, spans are now sampled on serialization. With this change,
+            # a span's trace flags could be propagated before a sampling
+            # decision is made. Since the default sampling decision is to not sample spans, this can result
+            # in missing spans. To resolve this issue, a sampling decision must be made the first time
+            # the span context is accessed.
+            ddtracer.sample(self._ddspan._local_root or self._ddspan)
+
+        if self._ddspan.context.sampling_priority is None:
+            tf = TraceFlags.DEFAULT
+            log.warning(
+                "Span context is missing a sampling decision, defaulting to unsampled: %s", str(self._ddspan.context)
+            )
+        elif self._ddspan.context.sampling_priority > 0:
+            tf = TraceFlags.SAMPLED
+        else:
+            tf = TraceFlags.DEFAULT
+
+        # Evaluate the tracestate header after the sampling decision has been made
+        ts_str = w3c_tracestate_add_p(self._ddspan.context._tracestate, self._ddspan.span_id)
+        ts = TraceState.from_header([ts_str])

         return SpanContext(self._ddspan.trace_id, self._ddspan.span_id, False, tf, ts)
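To make the new control flow concrete, below is a small, self-contained sketch (not part of this PR, and not ddtrace's implementation) of how a Datadog sampling priority maps to W3C trace flags and how those flags surface in a traceparent header. The flags_from_priority helper and the literal trace/span IDs are illustrative assumptions.

# Illustrative sketch only: mirrors the priority -> TraceFlags mapping used above.
from opentelemetry.trace import TraceFlags


def flags_from_priority(sampling_priority):
    # Any positive priority (e.g. AUTO_KEEP=1, USER_KEEP=2) marks the trace as sampled;
    # None or a non-positive priority leaves the default (unsampled) flags.
    if sampling_priority is not None and sampling_priority > 0:
        return TraceFlags.SAMPLED
    return TraceFlags.DEFAULT


# Hypothetical IDs, chosen to match the W3C trace-context example values.
trace_id = 0x0AF7651916CD43DD8448EB211C80319C
span_id = 0xB7AD6B7169203331
flags = flags_from_priority(1)  # e.g. AUTO_KEEP
print(f"00-{trace_id:032x}-{span_id:016x}-{flags:02x}")
# -> 00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01

Before this fix, get_span_context could run while sampling_priority was still None, so the flags byte above would default to 00 and downstream services would treat the trace as unsampled.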

@@ -0,0 +1,5 @@
+---
+fixes:
+  - |
+    opentelemetry: Resolves an edge case where distributed tracing headers could be generated before a sampling decision is made,
+    resulting in dropped spans in downstream services.
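As a hedged illustration of the scenario this note describes (assuming ddtrace's OpenTelemetry API support is enabled, for example via DD_TRACE_OTEL_ENABLED=1): headers are injected while the span is still open, i.e. before the trace would otherwise be sampled on serialization, so with this fix accessing the span context during injection forces the sampling decision and the propagated flags are meaningful.

from opentelemetry import trace
from opentelemetry.trace import set_span_in_context
from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator

tracer = trace.get_tracer(__name__)
headers = {}
with tracer.start_as_current_span("upstream-request") as span:
    # Injection happens mid-span, so the sampling decision must already be
    # reflected in the span context at this point.
    TraceContextTextMapPropagator().inject(headers, set_span_in_context(span))
    # e.g. requests.get(downstream_url, headers=headers)  # hypothetical downstream call

print(headers["traceparent"])  # flags byte now matches the sampling decision, e.g. ...-01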
58 changes: 53 additions & 5 deletions tests/opentelemetry/test_trace.py
@@ -1,5 +1,7 @@
 import mock
 import opentelemetry
+from opentelemetry.trace import set_span_in_context
+from opentelemetry.trace.propagation.tracecontext import TraceContextTextMapPropagator
 import opentelemetry.version
 import pytest

@@ -50,7 +52,7 @@ def test_otel_start_span_without_default_args(oteltracer):
     root = oteltracer.start_span("root-span")
     otel_span = oteltracer.start_span(
         "test-start-span",
-        context=opentelemetry.trace.set_span_in_context(root),
+        context=set_span_in_context(root),
         kind=opentelemetry.trace.SpanKind.CLIENT,
         attributes={"start_span_tag": "start_span_val"},
         links=None,
@@ -117,7 +119,7 @@ def test_otel_start_current_span_without_default_args(oteltracer):
     with oteltracer.start_as_current_span("root-span") as root:
         with oteltracer.start_as_current_span(
             "test-start-current-span-no-defualts",
-            context=opentelemetry.trace.set_span_in_context(root),
+            context=set_span_in_context(root),
             kind=opentelemetry.trace.SpanKind.SERVER,
             attributes={"start_current_span_tag": "start_cspan_val"},
             links=[],
@@ -138,6 +140,50 @@ def test_otel_start_current_span_without_default_args(oteltracer):
         otel_span.end()


+def test_otel_get_span_context_sets_sampling_decision(oteltracer):
+    with oteltracer.start_span("otel-server") as otelspan:
+        # Sampling priority is not set on span creation
+        assert otelspan._ddspan.context.sampling_priority is None
+        # Ensure the sampling priority is always consistent with traceflags
+        span_context = otelspan.get_span_context()
+        # Sampling priority is evaluated when the SpanContext is first accessed
+        sp = otelspan._ddspan.context.sampling_priority
+        assert sp is not None
+        if sp > 0:
+            assert span_context.trace_flags == 1
+        else:
+            assert span_context.trace_flags == 0
+        # Ensure the sampling priority is always consistent
+        for _ in range(1000):
+            otelspan.get_span_context()
+            assert otelspan._ddspan.context.sampling_priority == sp
+
+
+def test_distributed_trace_inject(oteltracer):  # noqa:F811
+    with oteltracer.start_as_current_span("test-otel-distributed-trace") as span:
+        headers = {}
+        TraceContextTextMapPropagator().inject(headers, set_span_in_context(span))
+        sp = span.get_span_context()
+        assert headers["traceparent"] == f"00-{sp.trace_id:032x}-{sp.span_id:016x}-{sp.trace_flags:02x}"
+        assert headers["tracestate"] == sp.trace_state.to_header()
+
+
+def test_distributed_trace_extract(oteltracer):  # noqa:F811
+    headers = {
+        "traceparent": "00-0af7651916cd43dd8448eb211c80319c-b7ad6b7169203331-01",
+        "tracestate": "congo=t61rcWkgMzE,dd=s:2",
+    }
+    context = TraceContextTextMapPropagator().extract(headers)
+    with oteltracer.start_as_current_span("test-otel-distributed-trace", context=context) as span:
+        sp = span.get_span_context()
+        assert sp.trace_id == int("0af7651916cd43dd8448eb211c80319c", 16)
+        assert span._ddspan.parent_id == int("b7ad6b7169203331", 16)
+        assert sp.trace_flags == 1
+        assert sp.trace_state.get("congo") == "t61rcWkgMzE"
+        assert "s:2" in sp.trace_state.get("dd")
+        assert sp.is_remote is False
+
+
 @flaky(1717428664)
 @pytest.mark.parametrize(
     "flask_wsgi_application,flask_env_arg,flask_port,flask_command",
@@ -164,10 +210,12 @@ def test_otel_start_current_span_without_default_args(oteltracer):
         "with_opentelemetry_instrument",
     ],
 )
-@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.flask.version"])
+@pytest.mark.snapshot(ignores=["metrics.net.peer.port", "meta.traceparent", "meta.tracestate", "meta.flask.version"])
 def test_distributed_trace_with_flask_app(flask_client, oteltracer):  # noqa:F811
-    with oteltracer.start_as_current_span("test-otel-distributed-trace"):
-        resp = flask_client.get("/otel")
+    with oteltracer.start_as_current_span("test-otel-distributed-trace") as span:
+        headers = {}
+        TraceContextTextMapPropagator().inject(headers, set_span_in_context(span))
+        resp = flask_client.get("/otel", headers=headers)

     assert resp.text == "otel"
     assert resp.status_code == 200
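The extract test above asserts against the dd member of the incoming tracestate header. As a rough guide to those values, the toy parser below (illustrative only, not ddtrace's W3C propagator) pulls out the key:value pairs; s carries the upstream sampling priority, which is why the test expects trace_flags == 1 after extraction.

def parse_dd_member(tracestate_header):
    # Split the header into list members, then break the Datadog member into key:value pairs.
    members = dict(item.split("=", 1) for item in tracestate_header.split(","))
    dd_value = members.get("dd", "")
    return dict(pair.split(":", 1) for pair in dd_value.split(";") if pair)


print(parse_dd_member("congo=t61rcWkgMzE,dd=s:2"))
# -> {'s': '2'}: the upstream service kept the trace (priority 2), so the
#    extracted context carries sampled trace flags downstream.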
@@ -10,9 +10,9 @@
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535db00000000",
"_dd.p.tid": "66a6956800000000",
"language": "python",
"runtime-id": "b4ffa244c11343de919ca20a7e8eebcf"
"runtime-id": "0da930484bcf4b488bfd7bb9cfd9c4b6"
},
"metrics": {
"_dd.top_level": 1,
@@ -35,11 +35,12 @@
"meta": {
"_dd.base_service": "",
"_dd.p.dm": "-0",
"_dd.p.tid": "655535db00000000",
"_dd.p.tid": "66a6956800000000",
"_dd.parent_id": "4f4e94b0f4d57229",
"component": "flask",
"flask.endpoint": "otel",
"flask.url_rule": "/otel",
"flask.version": "2.1.3",
"flask.version": "1.1.4",
"http.method": "GET",
"http.route": "/otel",
"http.status_code": "200",
@@ -56,10 +57,10 @@
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"process_id": 7713
"process_id": 31660
},
"duration": 14813125,
"start": 1700083163141821300
"duration": 4600000,
"start": 1722193256335520000
},
{
"name": "flask.application",
@@ -76,8 +77,8 @@
"flask.endpoint": "otel",
"flask.url_rule": "/otel"
},
"duration": 14243208,
"start": 1700083163142061300
"duration": 4219000,
"start": 1722193256335701000
},
{
"name": "flask.try_trigger_before_first_request_functions",
@@ -92,8 +93,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 14625,
"start": 1700083163142217383
"duration": 12000,
"start": 1722193256335827000
},
{
"name": "flask.preprocess_request",
@@ -108,8 +109,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 20125,
"start": 1700083163142331717
"duration": 13000,
"start": 1722193256335911000
},
{
"name": "flask.dispatch_request",
@@ -124,8 +125,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 13437750,
"start": 1700083163142415633
"duration": 3646000,
"start": 1722193256335970000
},
{
"name": "tests.opentelemetry.flask_app.otel",
@@ -140,8 +141,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 13354542,
"start": 1700083163142480508
"duration": 3585000,
"start": 1722193256336021000
},
{
"name": "internal",
@@ -155,8 +156,8 @@
"meta": {
"_dd.base_service": ""
},
"duration": 52875,
"start": 1700083163155755800
"duration": 36000,
"start": 1722193256339552000
},
{
"name": "flask.process_response",
@@ -171,8 +172,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 32375,
"start": 1700083163156004175
"duration": 17000,
"start": 1722193256339703000
},
{
"name": "flask.do_teardown_request",
@@ -187,8 +188,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 31625,
"start": 1700083163156192133
"duration": 14000,
"start": 1722193256339831000
},
{
"name": "flask.do_teardown_appcontext",
@@ -203,8 +204,8 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 9292,
"start": 1700083163156278258
"duration": 10000,
"start": 1722193256339897000
},
{
"name": "flask.response",
@@ -219,6 +220,6 @@
"_dd.base_service": "",
"component": "flask"
},
"duration": 311541,
"start": 1700083163156314342
"duration": 183000,
"start": 1722193256339931000
}]]
@@ -10,31 +10,32 @@
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535dc00000000",
"_dd.p.tid": "66a666b600000000",
"language": "python",
"runtime-id": "b4ffa244c11343de919ca20a7e8eebcf"
"runtime-id": "4fe738d296c04c00aff7eb2a66389ad4"
},
"metrics": {
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"process_id": 7703
"process_id": 75526
},
"duration": 24467250,
"start": 1700083164080082009
"duration": 3561000,
"start": 1722181302828129000
},
{
"name": "server",
"service": "",
"resource": "/otel",
"resource": "GET /otel",
"trace_id": 0,
"span_id": 2,
"parent_id": 1,
"type": "",
"error": 0,
"meta": {
"_dd.p.dm": "-0",
"_dd.p.tid": "655535dc00000000",
"_dd.p.tid": "66a666b600000000",
"_dd.parent_id": "ae6e2bf21738f62f",
"http.flavor": "1.1",
"http.host": "0.0.0.0:8001",
"http.method": "GET",
@@ -45,21 +46,22 @@
"http.target": "/otel",
"http.user_agent": "python-requests/2.28.1",
"language": "python",
"net.host.name": "0.0.0.0:8001",
"net.peer.ip": "127.0.0.1",
"runtime-id": "f90fe90fc53a4388b02210639d156981",
"runtime-id": "0dacca250fe4471094dc593f4892b91f",
"span.kind": "server",
"tracestate": "dd=s:1;t.dm:-0"
"tracestate": "dd=p:ae6e2bf21738f62f;s:1;t.dm:-0"
},
"metrics": {
"_dd.top_level": 1,
"_dd.tracer_kr": 1.0,
"_sampling_priority_v1": 1,
"net.host.port": 8001,
"net.peer.port": 65508,
"process_id": 7721
"net.peer.port": 62683,
"process_id": 75530
},
"duration": 413250,
"start": 1700083164081574592
"duration": 515000,
"start": 1722181302830460000
},
{
"name": "internal",
@@ -70,6 +72,6 @@
"parent_id": 2,
"type": "",
"error": 0,
"duration": 18250,
"start": 1700083164081867259
"duration": 16000,
"start": 1722181302830847000
}]]