symbolication: support uploading local symbols remotely
Gandem committed Jun 20, 2024
1 parent 612ff65 commit 941648c
Showing 16 changed files with 425 additions and 27 deletions.
1 change: 1 addition & 0 deletions README.md
@@ -45,6 +45,7 @@ DD_API_KEY=your-api-key # required
DD_SITE=datadoghq.com # optional, defaults to "datadoghq.com"
OTEL_PROFILING_AGENT_SERVICE=my-service # optional, defaults to "otel-profiling-agent-dev"
OTEL_PROFILING_AGENT_REPORTER_INTERVAL=10s # optional, defaults to 60s
DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD=true # optional, defaults to false
```

Then, you can run the agent with the following command:
9 changes: 7 additions & 2 deletions docker-compose.yml
@@ -8,10 +8,15 @@ services:
- arch=${ARCH:?error}
privileged: true
pid: "host"
environment:
DD_SITE: ${DD_SITE:-datadoghq.com}
DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD: ${DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD:-false}
volumes:
- .:/agent
- /var/run/docker.sock:/var/run/docker.sock:ro
command: bash -c "sudo mount -t debugfs none /sys/kernel/debug && make && sudo /agent/otel-profiling-agent -tags 'service:${OTEL_PROFILING_AGENT_SERVICE:-otel-profiling-agent-dev};remote_symbols:yes' -collection-agent "http://datadog-agent:8126" -reporter-interval ${OTEL_PROFILING_AGENT_REPORTER_INTERVAL:-60s} -samples-per-second 20 -save-cpuprofile"
secrets:
- dd-api-key
command: ['/bin/sh', '-c', 'export DD_API_KEY=$$(cat /run/secrets/dd-api-key); sudo mount -t debugfs none /sys/kernel/debug && make && sudo -E /agent/otel-profiling-agent -tags "service:${OTEL_PROFILING_AGENT_SERVICE:-otel-profiling-agent-dev};remote_symbols:yes" -collection-agent "http://datadog-agent:8126" -reporter-interval ${OTEL_PROFILING_AGENT_REPORTER_INTERVAL:-60s} -samples-per-second 20 -save-cpuprofile']

datadog-agent:
image: gcr.io/datadoghq/agent:7
@@ -24,7 +29,7 @@ services:
- /sys/fs/cgroup/:/host/sys/fs/cgroup:ro
secrets:
- dd-api-key
entrypoint: [ '/bin/sh', '-c', 'export DD_API_KEY=$$(cat /run/secrets/dd-api-key) ; /bin/entrypoint.sh' ]
entrypoint: ['/bin/sh', '-c', 'export DD_API_KEY=$$(cat /run/secrets/dd-api-key) ; /bin/entrypoint.sh']

secrets:
dd-api-key:
66 changes: 62 additions & 4 deletions libpf/pfelf/file.go
@@ -31,6 +31,7 @@ import (
"io"
"os"
"path/filepath"
"slices"
"sort"
"syscall"
"unsafe"
@@ -49,8 +50,12 @@ const (
// parsed sections (e.g. symbol tables and string tables; libxul
// has about 4MB .dynstr)
maxBytesLargeSection = 16 * 1024 * 1024
buildIDSectionName = ".note.gnu.build-id"
)

var debugStrSectionNames = []string{".debug_str", ".zdebug_str", ".debug_str.dwo"}
var debugInfoSectionNames = []string{".debug_info", ".zdebug_info"}

// ErrSymbolNotFound is returned when requested symbol was not found
var ErrSymbolNotFound = errors.New("symbol not found")

@@ -109,6 +114,11 @@ type File struct {
// bias is the load bias for ELF files inside core dump
bias libpf.Address

// filePath is the path of the ELF file as opened by os.Open()
// This can be a path to a mapping file, or a path to the original ELF binary.
// This is empty if the file is opened from a coredump.
filePath string

// InsideCore indicates that this ELF is mapped from a coredump ELF
InsideCore bool

@@ -168,7 +178,7 @@ func Open(name string) (*File, error) {
return nil, err
}

ff, err := newFile(buffered, f, 0, false)
ff, err := newFile(name, buffered, f, 0, false)
if err != nil {
f.Close()
return nil, err
@@ -186,13 +196,15 @@ func (f *File) Close() (err error) {
}

// NewFile creates a new ELF file object that borrows the given reader.
func NewFile(r io.ReaderAt, loadAddress uint64, hasMusl bool) (*File, error) {
return newFile(r, nil, loadAddress, hasMusl)
func NewFile(path string, r io.ReaderAt, loadAddress uint64, hasMusl bool) (*File, error) {
return newFile(path, r, nil, loadAddress, hasMusl)
}

func newFile(r io.ReaderAt, closer io.Closer, loadAddress uint64, hasMusl bool) (*File, error) {
func newFile(path string, r io.ReaderAt, closer io.Closer,
loadAddress uint64, hasMusl bool) (*File, error) {
f := &File{
elfReader: r,
filePath: path,
InsideCore: loadAddress != 0,
closer: closer,
}
@@ -882,3 +894,49 @@ func (f *File) DynString(tag elf.DynTag) ([]string, error) {
func (f *File) IsGolang() bool {
return f.Section(".go.buildinfo") != nil || f.Section(".gopclntab") != nil
}

func (f *File) FilePath() (string, error) {
if f.InsideCore {
return "", errors.New("file path not available for ELF inside coredump")
}
return f.filePath, nil
}

// HasDWARFData is a copy of pfelf.HasDWARFData, but for the libpf.File interface.
func (f *File) HasDWARFData() bool {
hasBuildID := false
hasDebugStr := false
for _, section := range f.Sections {
// NOBITS indicates that the section is actually empty, regardless of the size in the
// section header.
if section.Type == elf.SHT_NOBITS {
continue
}

if section.Name == buildIDSectionName {
hasBuildID = true
}

if slices.Contains(debugStrSectionNames, section.Name) {
hasDebugStr = section.Size > 0
}

// Some files have suspicious near-empty, partially stripped sections; consider them as not
// having DWARF data.
// The simplest binary gcc 10 can generate ("return 0") has >= 48 bytes for each section.
// Let's not worry about executables that may not satisfy this, as they would not be of
// interest to us.
if section.Size < 32 {
continue
}

if slices.Contains(debugInfoSectionNames, section.Name) {
return true
}
}

// Some alternate debug files only have a .debug_str section. For these we want to return true.
// Use the absence of program headers and presence of a Build ID as heuristic to identify
// alternate debug files.
return len(f.Progs) == 0 && hasBuildID && hasDebugStr
}
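
The two helpers added above are what the uploader later relies on: FilePath() recovers the on-disk path (unless the ELF came from a coredump) and HasDWARFData() decides whether the binary carries local debug information worth uploading. A minimal usage sketch, not part of the commit — the import path is inferred from the libpf/pfelf layout and the binary path is purely illustrative:

```go
package main

import (
	"fmt"
	"log"

	// Import path inferred from the repository layout (libpf/pfelf); adjust if it differs.
	"github.com/elastic/otel-profiling-agent/libpf/pfelf"
)

func main() {
	// Any local ELF binary works here; this path is illustrative only.
	ef, err := pfelf.Open("/usr/local/bin/myservice")
	if err != nil {
		log.Fatalf("open: %v", err)
	}
	defer ef.Close()

	path, err := ef.FilePath()
	if err != nil {
		// FilePath returns an error for ELFs mapped from inside a coredump.
		log.Fatalf("no file path: %v", err)
	}

	if ef.HasDWARFData() {
		fmt.Printf("%s has local DWARF data and is a candidate for symbol upload\n", path)
	} else {
		fmt.Printf("%s has no usable DWARF data\n", path)
	}
}
```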
2 changes: 1 addition & 1 deletion libpf/process/coredump.go
@@ -564,5 +564,5 @@ func (cf *CoredumpFile) ReadAt(p []byte, addr int64) (int, error) {
// The returned `pfelf.File` is borrowing the coredump file. Closing it will not close the
// underlying CoredumpFile.
func (cf *CoredumpFile) OpenELF() (*pfelf.File, error) {
return pfelf.NewFile(cf, cf.Base, cf.parent.hasMusl)
return pfelf.NewFile(cf.Name, cf, cf.Base, cf.parent.hasMusl)
}
2 changes: 1 addition & 1 deletion libpf/process/process.go
@@ -230,7 +230,7 @@ func (sp *systemProcess) OpenELF(file string) (*pfelf.File, error) {
if err != nil {
return nil, fmt.Errorf("failed to extract VDSO: %v", err)
}
return pfelf.NewFile(vdso, 0, false)
return pfelf.NewFile(file, vdso, 0, false)
}
ef, err := pfelf.Open(sp.GetMappingFile(m))
if err == nil {
17 changes: 16 additions & 1 deletion main.go
@@ -28,6 +28,7 @@ import (
"github.com/elastic/otel-profiling-agent/metrics/agentmetrics"
"github.com/elastic/otel-profiling-agent/reporter"

"github.com/elastic/otel-profiling-agent/symbolication"
"github.com/elastic/otel-profiling-agent/tracer"

log "github.com/sirupsen/logrus"
@@ -333,8 +334,22 @@ func mainWithExitCode() exitCode {
// Start reporter metric reporting with 60 second intervals.
defer reportermetrics.Start(mainCtx, rep, 60*time.Second)()

uploader := symbolication.NewNoopUploader()

ddSymbolUpload := os.Getenv("DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD")
if ddSymbolUpload == "true" {
log.Infof("Enabling Datadog local symbol upload")
uploader, err = symbolication.NewDatadogUploader()
if err != nil {
log.Errorf(
"Failed to create Datadog symbol uploader, symbol upload will be disabled: %v",
err,
)
}
}

// Load the eBPF code and map definitions
trc, err := tracer.NewTracer(mainCtx, rep, times, includeTracers, !argSendErrorFrames)
trc, err := tracer.NewTracer(mainCtx, rep, uploader, times, includeTracers, !argSendErrorFrames)
if err != nil {
msg := fmt.Sprintf("Failed to load eBPF tracer: %s", err)
log.Error(msg)
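
The symbolication package itself is among the files not expanded on this page. Going by the two constructors used here and the HandleExecutable call added in processmanager/execinfomanager/manager.go below, its Uploader interface plausibly looks like the following sketch — a reconstruction under assumed import paths, not the committed code:

```go
// Package symbolication: hypothetical reconstruction from the call sites in this commit.
package symbolication

import (
	"context"

	"github.com/elastic/otel-profiling-agent/libpf"       // assumed import path
	"github.com/elastic/otel-profiling-agent/libpf/pfelf" // assumed import path
)

// Uploader is handed every newly mapped executable by the process manager and
// may upload its local symbols to the backend.
type Uploader interface {
	HandleExecutable(ctx context.Context, elfRef *pfelf.Reference, fileID libpf.FileID) error
}

// noopUploader is the default when DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD is not set:
// it ignores every executable.
type noopUploader struct{}

// NewNoopUploader returns an Uploader that does nothing.
func NewNoopUploader() Uploader { return noopUploader{} }

func (noopUploader) HandleExecutable(context.Context, *pfelf.Reference, libpf.FileID) error {
	return nil
}
```

NewDatadogUploader() presumably returns a concrete implementation of the same interface along with an error, matching its use in main.go above.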
37 changes: 28 additions & 9 deletions processmanager/execinfomanager/manager.go
@@ -7,9 +7,11 @@
package execinfomanager

import (
"context"
"errors"
"fmt"
"os"
"time"

"github.com/elastic/otel-profiling-agent/config"
"github.com/elastic/otel-profiling-agent/host"
@@ -29,6 +31,7 @@ import (
"github.com/elastic/otel-profiling-agent/metrics"
pmebpf "github.com/elastic/otel-profiling-agent/processmanager/ebpf"
"github.com/elastic/otel-profiling-agent/support"
"github.com/elastic/otel-profiling-agent/symbolication"
"github.com/elastic/otel-profiling-agent/tpbase"
log "github.com/sirupsen/logrus"
"go.uber.org/multierr"
@@ -72,13 +75,17 @@ type ExecutableInfoManager struct {
// sdp allows fetching stack deltas for executables.
sdp nativeunwind.StackDeltaProvider

// uploader is used to upload symbolication data.
uploader symbolication.Uploader

// state bundles up all mutable state of the manager.
state xsync.RWMutex[executableInfoManagerState]
}

// NewExecutableInfoManager creates a new instance of the executable info manager.
func NewExecutableInfoManager(
sdp nativeunwind.StackDeltaProvider,
uploader symbolication.Uploader,
ebpf pmebpf.EbpfHandler,
includeTracers []bool,
) *ExecutableInfoManager {
@@ -104,7 +111,8 @@
}

return &ExecutableInfoManager{
sdp: sdp,
sdp: sdp,
uploader: uploader,
state: xsync.NewRWMutex(executableInfoManagerState{
interpreterLoaders: interpreterLoaders,
executables: map[host.FileID]*entry{},
@@ -119,7 +127,7 @@
//
// The return value is copied instead of returning a pointer in order to spare us the use
// of getters and more complicated locking semantics.
func (mgr *ExecutableInfoManager) AddOrIncRef(fileID host.FileID,
func (mgr *ExecutableInfoManager) AddOrIncRef(hostFileID host.FileID, fileID libpf.FileID,
elfRef *pfelf.Reference) (ExecutableInfo, error) {
var (
intervalData sdtypes.IntervalData
@@ -131,7 +139,7 @@

// Fast path for executable info that is already present.
state := mgr.state.WLock()
info, ok := state.executables[fileID]
info, ok := state.executables[hostFileID]
if ok {
defer mgr.state.WUnlock(&state)
info.rc++
@@ -142,7 +150,7 @@
// so we release the lock before doing this.
mgr.state.WUnlock(&state)

if err = mgr.sdp.GetIntervalStructuresForFile(fileID, elfRef, &intervalData); err != nil {
if err = mgr.sdp.GetIntervalStructuresForFile(hostFileID, elfRef, &intervalData); err != nil {
return ExecutableInfo{}, fmt.Errorf("failed to extract interval data: %w", err)
}

@@ -156,20 +164,19 @@
// Re-take the lock and check whether another thread beat us to
// inserting the data while we were waiting for the write lock.
state = mgr.state.WLock()
defer mgr.state.WUnlock(&state)
if info, ok = state.executables[fileID]; ok {
if info, ok = state.executables[hostFileID]; ok {
info.rc++
return info.ExecutableInfo, nil
}

// Load the data into BPF maps.
ref, gaps, err = state.loadDeltas(fileID, intervalData.Deltas)
ref, gaps, err = state.loadDeltas(hostFileID, intervalData.Deltas)
if err != nil {
return ExecutableInfo{}, fmt.Errorf("failed to load deltas: %w", err)
}

// Create the LoaderInfo for interpreter detection
loaderInfo := interpreter.NewLoaderInfo(fileID, elfRef, gaps)
loaderInfo := interpreter.NewLoaderInfo(hostFileID, elfRef, gaps)

// Insert a corresponding record into our map.
info = &entry{
@@ -180,7 +187,19 @@
mapRef: ref,
rc: 1,
}
state.executables[fileID] = info
state.executables[hostFileID] = info
mgr.state.WUnlock(&state)

// Processing symbols for upload can take a while, so we release the lock
// before doing this.
// We also use a timeout to avoid blocking the process manager for too long.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()

err = mgr.uploader.HandleExecutable(ctx, elfRef, fileID)
if err != nil {
log.Errorf("Failed to handle executable %v: %v", elfRef.FileName(), err)
}

return info.ExecutableInfo, nil
}
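
Note that the upload is triggered only after the new entry has been stored and the write lock released, and it is bounded by a two-second context, so a slow or stuck uploader cannot stall executable processing. A self-contained illustration of that timeout idiom (generic Go, not repository code; slowUpload is a stand-in for the uploader call):

```go
package main

import (
	"context"
	"errors"
	"fmt"
	"time"
)

// slowUpload stands in for an uploader call that may take arbitrarily long.
func slowUpload(ctx context.Context) error {
	select {
	case <-time.After(5 * time.Second): // pretend the upload needs 5s
		return nil
	case <-ctx.Done():
		return ctx.Err() // deadline exceeded after 2s
	}
}

func main() {
	ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
	defer cancel()

	if err := slowUpload(ctx); err != nil {
		if errors.Is(err, context.DeadlineExceeded) {
			fmt.Println("upload abandoned after 2s; unwinding state is unaffected")
		}
	}
}
```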
6 changes: 4 additions & 2 deletions processmanager/manager.go
@@ -28,6 +28,7 @@ import (
pmebpf "github.com/elastic/otel-profiling-agent/processmanager/ebpf"
eim "github.com/elastic/otel-profiling-agent/processmanager/execinfomanager"
"github.com/elastic/otel-profiling-agent/reporter"
"github.com/elastic/otel-profiling-agent/symbolication"
)

const (
@@ -63,7 +64,8 @@ var (
// the default implementation.
func New(ctx context.Context, includeTracers []bool, monitorInterval time.Duration,
ebpf pmebpf.EbpfHandler, fileIDMapper FileIDMapper, symbolReporter reporter.SymbolReporter,
sdp nativeunwind.StackDeltaProvider, filterErrorFrames bool) (*ProcessManager, error) {
uploader symbolication.Uploader, sdp nativeunwind.StackDeltaProvider,
filterErrorFrames bool) (*ProcessManager, error) {
if fileIDMapper == nil {
var err error
fileIDMapper, err = newFileIDMapper(lruFileIDCacheSize)
@@ -79,7 +81,7 @@
}
elfInfoCache.SetLifetime(elfInfoCacheTTL)

em := eim.NewExecutableInfoManager(sdp, ebpf, includeTracers)
em := eim.NewExecutableInfoManager(sdp, uploader, ebpf, includeTracers)

interpreters := make(map[libpf.PID]map[libpf.OnDiskFileIdentifier]interpreter.Instance)

4 changes: 4 additions & 0 deletions processmanager/manager_test.go
@@ -32,6 +32,7 @@ import (
"github.com/elastic/otel-profiling-agent/lpm"
"github.com/elastic/otel-profiling-agent/metrics"
pmebpf "github.com/elastic/otel-profiling-agent/processmanager/ebpf"
"github.com/elastic/otel-profiling-agent/symbolication"
)

// dummyProcess implements pfelf.Process for testing purposes
@@ -302,6 +303,7 @@ func TestInterpreterConvertTrace(t *testing.T) {
nil,
nil,
nil,
nil,
true)
if err != nil {
t.Fatalf("Failed to initialize new process manager: %v", err)
@@ -400,6 +402,7 @@ func TestNewMapping(t *testing.T) {
ebpfMockup,
NewMapFileIDMapper(),
nil,
symbolication.NewNoopUploader(),
&dummyProvider,
true)
if err != nil {
@@ -598,6 +601,7 @@ func TestProcExit(t *testing.T) {
ebpfMockup,
NewMapFileIDMapper(),
nil,
symbolication.NewNoopUploader(),
&dummyProvider,
true)
if err != nil {
9 changes: 8 additions & 1 deletion processmanager/processinfo.go
@@ -216,8 +216,15 @@ func (pm *ProcessManager) handleNewInterpreter(pr process.Process, m *Mapping,
// handleNewMapping processes new file backed mappings
func (pm *ProcessManager) handleNewMapping(pr process.Process, m *Mapping,
elfRef *pfelf.Reference) error {
fileID, ok := pm.FileIDMapper.Get(m.FileID)
if !ok {
log.Debugf("file ID lookup failed for PID %d, file ID %d",
pr.PID(), m.FileID)
fileID = libpf.UnsymbolizedFileID
}

// Resolve executable info first
ei, err := pm.eim.AddOrIncRef(m.FileID, elfRef)
ei, err := pm.eim.AddOrIncRef(m.FileID, fileID, elfRef)
if err != nil {
return err
}