From b69d1f596755cd1957c41b3202da5f507851a780 Mon Sep 17 00:00:00 2001 From: zirain Date: Thu, 26 Sep 2024 17:54:38 +0800 Subject: [PATCH] Set status code base on GRPC status code for OpenTelemetry tracers (#36173) Commit Message: Set status code base on GRPC status code for OpenTelemetry tracers Additional Description: related to https://github.com/istio/istio/issues/53025 Risk Level: Testing: Docs Changes: Release Notes: Platform Specific Features: [Optional Runtime guard:] [Optional Fixes #Issue] [Optional Fixes commit #PR or SHA] [Optional Deprecated:] [Optional [API Considerations](https://github.com/envoyproxy/envoy/blob/main/api/review_checklist.md):] --------- Signed-off-by: zirain --- changelogs/current.yaml | 3 + .../tracers/opentelemetry/tracer.cc | 37 ++++++++- .../opentelemetry_tracer_impl_test.cc | 83 +++++++++++++++++++ 3 files changed, 122 insertions(+), 1 deletion(-) diff --git a/changelogs/current.yaml b/changelogs/current.yaml index 064f4d8e8035..46f6352cf4af 100644 --- a/changelogs/current.yaml +++ b/changelogs/current.yaml @@ -109,6 +109,9 @@ minor_behavior_changes: change: | Enhanced listener filter chain execution to include the case that listener filter has maxReadBytes() of 0, but may return StopIteration in onAccept to wait for asynchronous callback. +- area: tracers + change: | + Set status code based on GRPC status code for OpenTelemetry tracers (previously unset). - area: http2 change: | Changes the default value of ``envoy.reloadable_features.http2_use_oghttp2`` to ``false``. This changes the codec used for HTTP/2 diff --git a/source/extensions/tracers/opentelemetry/tracer.cc b/source/extensions/tracers/opentelemetry/tracer.cc index 1bfb366ae634..446a66c3b952 100644 --- a/source/extensions/tracers/opentelemetry/tracer.cc +++ b/source/extensions/tracers/opentelemetry/tracer.cc @@ -122,8 +122,43 @@ void Span::setAttribute(absl::string_view name, const OTelAttribute& attribute_v *span_.add_attributes() = key_value; } +::opentelemetry::proto::trace::v1::Status_StatusCode +convertGrpcStatusToTraceStatusCode(::opentelemetry::proto::trace::v1::Span_SpanKind kind, + absl::string_view value) { + uint64_t grpc_status_code; + if (!absl::SimpleAtoi(value, &grpc_status_code)) { + // If the value is not a number, we can't map it to a status code. + // In this case, we should leave the status code unset. + return ::opentelemetry::proto::trace::v1::Status::STATUS_CODE_UNSET; + } + + Grpc::Status::GrpcStatus grpc_status = static_cast(grpc_status_code); + // Check mapping https://opentelemetry.io/docs/specs/semconv/rpc/grpc/#grpc-status + if (kind == ::opentelemetry::proto::trace::v1::Span::SPAN_KIND_CLIENT) { + if (grpc_status == Grpc::Status::WellKnownGrpcStatus::Ok) { + return ::opentelemetry::proto::trace::v1::Status::STATUS_CODE_UNSET; + } + return ::opentelemetry::proto::trace::v1::Status::STATUS_CODE_ERROR; + } + + // SPAN_KIND_SERVER + switch (grpc_status) { + case Grpc::Status::WellKnownGrpcStatus::Unknown: + case Grpc::Status::WellKnownGrpcStatus::DeadlineExceeded: + case Grpc::Status::WellKnownGrpcStatus::Unimplemented: + case Grpc::Status::WellKnownGrpcStatus::Internal: + case Grpc::Status::WellKnownGrpcStatus::Unavailable: + case Grpc::Status::WellKnownGrpcStatus::DataLoss: + return ::opentelemetry::proto::trace::v1::Status::STATUS_CODE_ERROR; + default: + return ::opentelemetry::proto::trace::v1::Status::STATUS_CODE_UNSET; + } +} + void Span::setTag(absl::string_view name, absl::string_view value) { - if (name == Tracing::Tags::get().HttpStatusCode) { + if (name == Tracing::Tags::get().GrpcStatusCode) { + span_.mutable_status()->set_code(convertGrpcStatusToTraceStatusCode(span_.kind(), value)); + } else if (name == Tracing::Tags::get().HttpStatusCode) { uint64_t status_code; // For HTTP status codes in the 5xx range, as well as any other code the client failed to // interpret, span status MUST be set to Error. diff --git a/test/extensions/tracers/opentelemetry/opentelemetry_tracer_impl_test.cc b/test/extensions/tracers/opentelemetry/opentelemetry_tracer_impl_test.cc index 1be53b67e720..25a93000931f 100644 --- a/test/extensions/tracers/opentelemetry/opentelemetry_tracer_impl_test.cc +++ b/test/extensions/tracers/opentelemetry/opentelemetry_tracer_impl_test.cc @@ -686,6 +686,89 @@ TEST_F(OpenTelemetryDriverTest, ExportOTLPSpanWithAttributesAndStatus) { EXPECT_EQ(1U, stats_.counter("tracing.opentelemetry.spans_sent").value()); } +// Verifies Grpc spans are exported with their attributes and status +TEST_F(OpenTelemetryDriverTest, ExportOTLPGRPCSpanWithAttributesAndStatus) { + setupValidDriver(); + Tracing::TestTraceContextImpl request_headers{ + {":authority", "test.com"}, {":path", "/"}, {":method", "GET"}}; + NiceMock& mock_random_generator_ = + context_.server_factory_context_.api_.random_; + int64_t generated_int = 1; + EXPECT_CALL(mock_random_generator_, random()).Times(3).WillRepeatedly(Return(generated_int)); + SystemTime timestamp = time_system_.systemTime(); + ON_CALL(stream_info_, startTime()).WillByDefault(Return(timestamp)); + + Tracing::SpanPtr span = driver_->startSpan(mock_tracing_config_, request_headers, stream_info_, + operation_name_, {Tracing::Reason::Sampling, true}); + EXPECT_NE(span.get(), nullptr); + + span->setTag("first_tag_name", "first_tag_value"); + span->setTag("second_tag_name", "second_tag_value"); + // Try an empty tag. + span->setTag("", "empty_tag_value"); + // Overwrite a tag. + span->setTag("first_tag_name", "first_tag_new_value"); + span->setTag("http.status_code", "200"); + span->setTag("grpc.status_code", "13"); + span->setTag("grpc.message", "connect Canceled randomly"); + + // Note the placeholders for the bytes - cleaner to manually set after. + constexpr absl::string_view request_yaml = R"( +resource_spans: + resource: + attributes: + key: "service.name" + value: + string_value: "unknown_service:envoy" + key: "key1" + value: + string_value: "val1" + scope_spans: + spans: + trace_id: "AAA" + span_id: "AAA" + name: "test" + kind: SPAN_KIND_SERVER + start_time_unix_nano: {} + end_time_unix_nano: {} + status: + code: STATUS_CODE_ERROR + attributes: + - key: "first_tag_name" + value: + string_value: "first_tag_new_value" + - key: "second_tag_name" + value: + string_value: "second_tag_value" + - key: "http.status_code" + value: + string_value: "200" + - key: "grpc.status_code" + value: + string_value: "13" + - key: "grpc.message" + value: + string_value: "connect Canceled randomly" + )"; + opentelemetry::proto::collector::trace::v1::ExportTraceServiceRequest request_proto; + int64_t timestamp_ns = std::chrono::nanoseconds(timestamp.time_since_epoch()).count(); + TestUtility::loadFromYaml(fmt::format(request_yaml, timestamp_ns, timestamp_ns), request_proto); + std::string generated_int_hex = Hex::uint64ToHex(generated_int); + auto* expected_span = + request_proto.mutable_resource_spans(0)->mutable_scope_spans(0)->mutable_spans(0); + expected_span->set_trace_id( + absl::HexStringToBytes(absl::StrCat(generated_int_hex, generated_int_hex))); + expected_span->set_span_id(absl::HexStringToBytes(absl::StrCat(generated_int_hex))); + + EXPECT_CALL(runtime_.snapshot_, getInteger("tracing.opentelemetry.min_flush_spans", 5U)) + .Times(1) + .WillRepeatedly(Return(1)); + EXPECT_CALL(*mock_stream_ptr_, + sendMessageRaw_(Grpc::ProtoBufferEqIgnoreRepeatedFieldOrdering(request_proto), _)); + span->finishSpan(); + EXPECT_EQ(1U, stats_.counter("tracing.opentelemetry.spans_sent").value()); +} + // Not sampled spans are ignored TEST_F(OpenTelemetryDriverTest, IgnoreNotSampledSpan) { setupValidDriver();