Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[jaeger-v2] Implement UTF-8 sanitizer for OTLP #6078

Open
wants to merge 12 commits into
base: main
Choose a base branch
from
1 change: 1 addition & 0 deletions cmd/jaeger/internal/sanitizer/sanitizer.go
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@ type Func func(traces ptrace.Traces) ptrace.Traces
func NewStandardSanitizers() []Func {
return []Func{
NewEmptyServiceNameSanitizer(),
NewUTF8Sanitizer(),
}
}

Expand Down
2 changes: 1 addition & 1 deletion cmd/jaeger/internal/sanitizer/sanitizer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ import (

func TestNewStandardSanitizers(t *testing.T) {
sanitizers := NewStandardSanitizers()
require.Len(t, sanitizers, 1)
require.Len(t, sanitizers, 2)
}

func TestNewChainedSanitizer(t *testing.T) {
Expand Down
93 changes: 93 additions & 0 deletions cmd/jaeger/internal/sanitizer/utf8.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
// Copyright (c) 2024 The Jaeger Authors.
// SPDX-License-Identifier: Apache-2.0

package sanitizer

import (
"unicode/utf8"

"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/ptrace"
)

const (
invalidSpanName = "invalid-span-name"
invalidTagKey = "invalid-tag-key"
)

// NewUTF8Sanitizer returns a sanitizer function that performs the following sanitizations
// on the trace data:
// - Checks all attributes of the span to ensure that the keys are valid UTF-8 strings
// and all string typed values are valid UTF-8 strings. If any keys or values are invalid,
// they are replaced with a valid UTF-8 string containing debugging data related to the invalidations.
// - Explicitly checks that all span names are valid UTF-8 strings. If they are not, they are replaced
// with a valid UTF-8 string and debugging information is put into the attributes of the span.
// - Explicitly checks that the service name is a valid UTF-8 string. If it is not, it is replaced with
// a valid UTF-8 string that contains debugging information related to the invalidation.
func NewUTF8Sanitizer() Func {
return sanitizeUTF8
}

func sanitizeUTF8(traces ptrace.Traces) ptrace.Traces {
resourceSpans := traces.ResourceSpans()
for i := 0; i < resourceSpans.Len(); i++ {
resourceSpan := resourceSpans.At(i)
sanitizeAttributes(resourceSpan.Resource().Attributes())

scopeSpans := resourceSpan.ScopeSpans()
for j := 0; j < scopeSpans.Len(); j++ {
scopeSpan := scopeSpans.At(j)
sanitizeAttributes(scopeSpan.Scope().Attributes())

spans := scopeSpan.Spans()

for k := 0; k < spans.Len(); k++ {
span := spans.At(k)

if !utf8.ValidString(span.Name()) {
sanitized := []byte(span.Name())
newVal := span.Attributes().PutEmptyBytes(invalidSpanName)
newVal.Append(sanitized...)
span.SetName(invalidSpanName)
}

sanitizeAttributes(span.Attributes())
}
}
}

return traces
}

func sanitizeAttributes(attributes pcommon.Map) {
// collect invalid keys during iteration to avoid changing the keys of the map
// while iterating over the map using Range
invalidKeys := make(map[string]pcommon.Value)

attributes.Range(func(k string, v pcommon.Value) bool {
if !utf8.ValidString(k) {
invalidKeys[k] = v
}

if v.Type() == pcommon.ValueTypeStr && !utf8.ValidString(v.Str()) {
sanitized := []byte(v.Str())
newVal := attributes.PutEmptyBytes(k)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This contradicts the comment above about modifying the attributes while iterating over them.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurishkuro the comment was for modifying the keys of the map. is there anything wrong with modifying the value associated with a key?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know, the command is put(), it may depend on the implementation.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It looks to just be changing the value if one already exists. We could do this in two passes if you prefer however.

func (m Map) PutEmptyBytes(k string) ByteSlice {
	m.getState().AssertMutable()
	bv := otlpcommon.AnyValue_BytesValue{}
	if av, existing := m.Get(k); existing {
		av.getOrig().Value = &bv
	} else {
		*m.getOrig() = append(*m.getOrig(), otlpcommon.KeyValue{Key: k, Value: otlpcommon.AnyValue{Value: &bv}})
	}
	return ByteSlice(internal.NewByteSlice(&bv.BytesValue, m.getState()))
}

newVal.Append(sanitized...)
}
return true
})

for k, v := range invalidKeys {
sanitized := []byte(k + ":")
switch v.Type() {
Copy link
Contributor Author

@mahadzaryab1 mahadzaryab1 Oct 12, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

From the godoc of Type()

// Type returns the type of the value for this Value.
// Calling this function on zero-initialized Value will cause a panic.

Can we run into a situation where we call this on a zero-initialized value in the regular exporter flow?

case pcommon.ValueTypeBytes:
sanitized = append(sanitized, v.Bytes().AsRaw()...)
default:
sanitized = append(sanitized, []byte(v.AsString())...)
}

attributes.Remove(k)
newVal := attributes.PutEmptyBytes(invalidTagKey)
newVal.Append(sanitized...)
}
}
250 changes: 250 additions & 0 deletions cmd/jaeger/internal/sanitizer/utf8_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,250 @@
// Copyright (c) 2024 The Jaeger Authors.
// SPDX-License-Identifier: Apache-2.0

package sanitizer

import (
"encoding/hex"
"fmt"
"testing"

"github.com/stretchr/testify/require"
"go.opentelemetry.io/collector/pdata/pcommon"
"go.opentelemetry.io/collector/pdata/ptrace"
)

func invalidUTF8() string {
s, _ := hex.DecodeString("fefeffff")
return string(s)
}

func getBytesValueFromString(s string) pcommon.Value {
b := pcommon.NewValueBytes()
b.Bytes().Append([]byte(s)...)
return b
}

var utf8EncodingTests = []struct {
name string
key string
value string
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It seems if your tests only allow string values you cannot test other attribute types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@yurishkuro I added some separate tests for non-string attributes

expectedKey string
expectedValue pcommon.Value
}{
{
name: "valid key + valid value",
key: "key",
value: "value",
expectedKey: "key",
expectedValue: pcommon.NewValueStr("value"),
},
{
name: "invalid key + valid value",
key: invalidUTF8(),
value: "value",
expectedKey: "invalid-tag-key",
expectedValue: getBytesValueFromString(fmt.Sprintf("%s:value", invalidUTF8())),
},
{
name: "valid key + invalid value",
key: "key",
value: invalidUTF8(),
expectedKey: "key",
expectedValue: getBytesValueFromString(invalidUTF8()),
},
{
name: "invalid key + invalid value",
key: invalidUTF8(),
value: invalidUTF8(),
expectedKey: "invalid-tag-key",
expectedValue: getBytesValueFromString(fmt.Sprintf("%s:%s", invalidUTF8(), invalidUTF8())),
},
}

func TestUTF8Sanitizer_SanitizesResourceSpanAttributes(t *testing.T) {
tests := utf8EncodingTests
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
Resource().
Attributes().
PutStr(test.key, test.value)
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
value, ok := sanitized.
ResourceSpans().
At(0).
Resource().
Attributes().
Get(test.expectedKey)
require.True(t, ok)
require.Equal(t, test.expectedValue, value)
})
}
}

func TestUTF8Sanitizer_SanitizesScopeSpanAttributes(t *testing.T) {
tests := utf8EncodingTests
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
ScopeSpans().
AppendEmpty().
Scope().
Attributes().
PutStr(test.key, test.value)
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
value, ok := sanitized.
ResourceSpans().
At(0).
ScopeSpans().
At(0).
Scope().
Attributes().
Get(test.expectedKey)
require.True(t, ok)
require.Equal(t, test.expectedValue, value)
})
}
}

func TestUTF8Sanitizer_SanitizesSpanAttributes(t *testing.T) {
tests := utf8EncodingTests
for _, test := range tests {
t.Run(test.name, func(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
ScopeSpans().
AppendEmpty().
Spans().
AppendEmpty().
Attributes().
PutStr(test.key, test.value)
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
value, ok := sanitized.
ResourceSpans().
At(0).
ScopeSpans().
At(0).
Spans().
At(0).
Attributes().
Get(test.expectedKey)
require.True(t, ok)
require.Equal(t, test.expectedValue, value)
})
}
}

func TestUTF8Sanitizer_SanitizesInvalidSpanName(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
ScopeSpans().
AppendEmpty().
Spans().
AppendEmpty().
SetName(invalidUTF8())
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
name := sanitized.
ResourceSpans().
At(0).
ScopeSpans().
At(0).
Spans().
At(0).
Name()
require.Equal(t, "invalid-span-name", name)
}

func TestUTF8Sanitizer_DoesNotSanitizeInvalidSpanName(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
ScopeSpans().
AppendEmpty().
Spans().
AppendEmpty().
SetName("name")
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
name := sanitized.
ResourceSpans().
At(0).
ScopeSpans().
At(0).
Spans().
At(0).
Name()
require.Equal(t, "name", name)
}

func TestUTF8Sanitizer_RemovesInvalidKeys(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
Resource().
Attributes().
PutStr(invalidUTF8(), "value")
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
_, ok := sanitized.
ResourceSpans().
At(0).
Resource().
Attributes().
Get(invalidUTF8())
require.False(t, ok)
}

func TestUTF8Sanitizer_DoesNotSanitizeNonStringAttributeValue(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
Resource().
Attributes().PutInt("key", 1)
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
value, ok := sanitized.
ResourceSpans().
At(0).
Resource().
Attributes().
Get("key")
require.True(t, ok)
require.EqualValues(t, 1, value.Int())
}

func TestUTF8Sanitizer_SanitizesNonStringAttributeValueWithInvalidKey(t *testing.T) {
traces := ptrace.NewTraces()
traces.
ResourceSpans().
AppendEmpty().
Resource().
Attributes().PutInt(invalidUTF8(), 1)
sanitizer := NewUTF8Sanitizer()
sanitized := sanitizer(traces)
value, ok := sanitized.
ResourceSpans().
At(0).
Resource().
Attributes().
Get("invalid-tag-key")
require.True(t, ok)
require.EqualValues(t, getBytesValueFromString(fmt.Sprintf("%s:1", invalidUTF8())), value)
}
Loading