Skip to content

Commit

Permalink
uploader: more exhaustive search for debug symbols
Browse files Browse the repository at this point in the history
uploader: add support for dry run upload

this is useful for testing locally and getting a good grasp of how many
binaries we will be uploading when the local symbol upload feature is
active

pfelf: refactor debug links support

add OpenDebugBuildID method and refactor OpenDebugLink method so that
it's compatible with https://sourceware.org/gdb/current/onlinedocs/gdb.html/Separate-Debug-Files.html

this is the way that gdb searches for separate debug executables, so it
should hopefully be more exhaustive.

as a side note, -dbgsym packages on ubuntu (for example,
openssh-server-dbgsym) only download symbols to the build ID directory
/usr/lib/debug/.build-id/ (and not the debug link directories), hence
the new method also maximizes the chance to find potential debug symbols
on ubuntu.

uploader: more exhaustive search for debug symbols

take advantage of the newly added functions in pfelf to look for
debug symbols in /usr/lib/debug/.build-id and more generally in
/usr/lib/debug, which maximizes our chances to find debug symbols
locally.

symbolication: refactor uploader

multiple small refactors to hopefully make the code more robust:
* remove the context in HandleExecutable(), now it should be clearer
which parts are synchronous, which parts are async, and what timeouts
apply to each section
* do symbol extraction (via objcopy) asynchronously, we don't strictly
need it to be synchronous so we do it async which should hopefully
reduce the impact on the process manager run frequency
* group extraction and upload in the same function (to which the async
timeout of 10s by default applies)

in the future it might make sense to make this timeout configurable

uploader: avoid failing if golang binary does not have build ID

some golang executables don't have a build ID, so we don't want to fail
if we can't get the GNU build ID for those executables

uploader: modify interface to not return error

logging an error when the uploader fails to handle an executable turned
out to be relatively noisy, so we modify the interface to no longer
return an error, and we log.Debugf errors we encounter in the uploader

pfelf: fix debug link path

Co-authored-by: Nicolas Savoire <[email protected]>
  • Loading branch information
Gandem and nsavoire committed Jul 1, 2024
1 parent 5ee9584 commit f83ad44
Show file tree
Hide file tree
Showing 6 changed files with 161 additions and 61 deletions.
1 change: 1 addition & 0 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@ services:
environment:
DD_SITE: ${DD_SITE:-datadoghq.com}
DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD: ${DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD:-false}
DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD_DRY_RUN: ${DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD_DRY_RUN:-false}
VERSION: ${VERSION:-local-dev}
volumes:
- .:/agent
Expand Down
51 changes: 47 additions & 4 deletions libpf/pfelf/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,8 @@ const (
var debugStrSectionNames = []string{".debug_str", ".zdebug_str", ".debug_str.dwo"}
var debugInfoSectionNames = []string{".debug_info", ".zdebug_info"}

var globalDebugDirectories = []string{"/usr/lib/debug"}

// ErrSymbolNotFound is returned when requested symbol was not found
var ErrSymbolNotFound = errors.New("symbol not found")

Expand Down Expand Up @@ -604,6 +606,37 @@ func (f *File) insertTLSDescriptorsForSection(descs map[string]libpf.Address,
return nil
}

// OpenDebugBuildID looks for a separate debug ELF matching this file's build ID.
//
// For every entry of globalDebugDirectories it probes
// <dir>/.build-id/<first two characters of the build ID>/<remaining characters>.debug
// and returns the first file that can be opened and reports the same build ID,
// together with its path. If the build ID cannot be read, is shorter than two
// characters, or no candidate matches, it returns (nil, "").
func (f *File) OpenDebugBuildID(elfOpener ELFOpener) (
	debugELF *File, debugFile string) {
	id, err := f.GetBuildID()
	if err != nil || len(id) < 2 {
		return nil, ""
	}

	// The build ID is split into a two character directory and a file name.
	subDir, baseName := id[:2], id[2:]+".debug"

	for _, root := range globalDebugDirectories {
		candidatePath := filepath.Join(root, ".build-id", subDir, baseName)
		candidate, openErr := elfOpener.OpenELF(candidatePath)
		if openErr != nil {
			continue
		}
		if candidateID, idErr := candidate.GetBuildID(); idErr == nil && candidateID == id {
			return candidate, candidatePath
		}
		// Unreadable or mismatching build ID: release the candidate and keep looking.
		candidate.Close()
	}
	return nil, ""
}

// GetDebugLink reads and parses the .gnu_debuglink section.
// If the link does not exist then ErrNoDebugLink is returned.
func (f *File) GetDebugLink() (linkName string, crc int32, err error) {
Expand All @@ -626,13 +659,23 @@ func (f *File) OpenDebugLink(elfFilePath string, elfOpener ELFOpener) (
linkName, linkCRC32, err := f.GetDebugLink()
if err != nil {
// Treat missing or corrupt tag as soft error.
return
return nil, ""
}

// Try to find the debug file
executablePath := filepath.Dir(elfFilePath)
for _, debugPath := range []string{"/usr/lib/debug/"} {
debugFile = filepath.Join(debugPath, executablePath, linkName)

debugDirectories := []string{
executablePath,
filepath.Join(executablePath, ".debug"),
}
for _, dir := range globalDebugDirectories {
debugDirectories = append(debugDirectories,
filepath.Join(dir, executablePath))
}

for _, debugPath := range debugDirectories {
debugFile = filepath.Join(debugPath, linkName)
debugELF, err = elfOpener.OpenELF(debugFile)
if err != nil {
continue
Expand All @@ -648,7 +691,7 @@ func (f *File) OpenDebugLink(elfFilePath string, elfOpener ELFOpener) (
}
return debugELF, debugFile
}
return
return nil, ""
}

// CRC32 calculates the .gnu_debuglink compatible CRC-32 of the ELF file
Expand Down
13 changes: 2 additions & 11 deletions processmanager/execinfomanager/manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,17 +7,15 @@
package execinfomanager

import (
"context"
"errors"
"fmt"
"os"
"time"

lru "github.com/elastic/go-freelru"
"github.com/elastic/otel-profiling-agent/libpf"
log "github.com/sirupsen/logrus"

lru "github.com/elastic/go-freelru"

"github.com/elastic/otel-profiling-agent/config"
"github.com/elastic/otel-profiling-agent/host"
"github.com/elastic/otel-profiling-agent/interpreter"
Expand Down Expand Up @@ -234,14 +232,7 @@ func (mgr *ExecutableInfoManager) AddOrIncRef(hostFileID host.FileID, fileID lib

// Processing symbols for upload can take a while, so we release the lock
// before doing this.
// We also use a timeout to avoid blocking the process manager for too long.
ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second)
defer cancel()

err = mgr.uploader.HandleExecutable(ctx, elfRef, fileID)
if err != nil {
log.Errorf("Failed to handle executable %v: %v", elfRef.FileName(), err)
}
mgr.uploader.HandleExecutable(elfRef, fileID)

return info.ExecutableInfo, nil
}
Expand Down
146 changes: 109 additions & 37 deletions symbolication/datadog_uploader.go
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ const binaryCacheSize = 1000

const sourceMapEndpoint = "/api/v2/srcmap"

const uploadTimeout = 10 * time.Second

// DatadogUploader is the Uploader returned by NewDatadogUploader. It carries the
// settings and bookkeeping that HandleExecutable uses to upload debug symbols.
type DatadogUploader struct {
// ddAPIKey is populated by NewDatadogUploader.
ddAPIKey string
// intakeURL is populated by NewDatadogUploader.
intakeURL string
// dryRun is true when DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD_DRY_RUN is exactly "true".
// HandleExecutable then only logs which symbols it would upload.
dryRun bool

// uploadCache holds the file IDs HandleExecutable has already dispatched so that
// repeated executables are skipped. It is created with room for binaryCacheSize entries.
uploadCache *lru.SyncedLRU[libpf.FileID, struct{}]
}
Expand Down Expand Up @@ -59,6 +62,8 @@ func NewDatadogUploader() (Uploader, error) {
return nil, fmt.Errorf("failed to parse URL: %w", err)
}

dryRun := os.Getenv("DD_EXPERIMENTAL_LOCAL_SYMBOL_UPLOAD_DRY_RUN") == "true"

uploadCache, err := lru.NewSynced[libpf.FileID, struct{}](binaryCacheSize, libpf.FileID.Hash32)
if err != nil {
return nil, fmt.Errorf("failed to create cache: %w", err)
Expand All @@ -67,55 +72,42 @@ func NewDatadogUploader() (Uploader, error) {
return &DatadogUploader{
ddAPIKey: ddAPIKey,
intakeURL: intakeURL,
dryRun: dryRun,

uploadCache: uploadCache,
}, nil
}

func (d *DatadogUploader) HandleExecutable(ctx context.Context, elfRef *pfelf.Reference,
fileID libpf.FileID) error {
func (d *DatadogUploader) HandleExecutable(elfRef *pfelf.Reference, fileID libpf.FileID) {
_, ok := d.uploadCache.Peek(fileID)
if ok {
log.Debugf("Skipping symbol upload for executable %s: already uploaded",
elfRef.FileName())
return nil
return
}
fileName := elfRef.FileName()

ef, err := elfRef.GetELF()
// If the ELF file is not found, we ignore it
// This can happen for short-lived processes that are already gone by the time
// we try to upload symbols
if err != nil {
log.Debugf("Skipping symbol upload for executable %s: %v",
fileName, err)
return nil
return
}

// We only upload symbols for executables that have DWARF data
if !ef.HasDWARFData() {
log.Debugf("Skipping symbol upload for executable %s as it does not have DWARF data",
fileName)
return nil
// This needs to be done synchronously before the process manager closes the elfRef
inputFilePath := localDebugSymbolsPath(ef, elfRef)
if inputFilePath == "" {
log.Debugf("Skipping symbol upload for executable %s: no debug symbols found", fileName)
return
}

e, err := newExecutableMetadata(fileName, ef, fileID)
if err != nil {
return err
}

inputFilePath, err := ef.FilePath()
if err != nil {
return fmt.Errorf("failed to get ELF file path: %w", err)
}

symbolFile, err := os.CreateTemp("", "objcopy-debug")
if err != nil {
return fmt.Errorf("failed to create temp file: %w", err)
}

err = d.copySymbols(ctx, inputFilePath, symbolFile.Name())
if err != nil {
return fmt.Errorf("failed to copy symbols: %w", err)
log.Debugf("Skipping symbol upload for executable %s: %v", fileName, err)
return
}

d.uploadCache.Add(fileID, struct{}{})
Expand All @@ -125,18 +117,22 @@ func (d *DatadogUploader) HandleExecutable(ctx context.Context, elfRef *pfelf.Re
// if there are many executables.
// Ideally, we should limit the number of concurrent uploads
go func() {
err = d.uploadSymbols(symbolFile, e)
ctx, cancel := context.WithTimeout(context.Background(), uploadTimeout)
defer cancel()

if d.dryRun {
log.Infof("Dry run: would upload symbols %s for executable: %s", inputFilePath, e)
return
}

err = d.handleSymbols(ctx, inputFilePath, e)
if err != nil {
log.Errorf("Failed to upload symbols: %v for executable: %s", err, e)
d.uploadCache.Remove(fileID)
log.Errorf("Failed to handle symbols: %v for executable: %s", err, e)
} else {
log.Infof("Symbols uploaded successfully for executable: %s", e)
}
symbolFile.Close()
os.Remove(symbolFile.Name())
}()

return nil
}

type executableMetadata struct {
Expand All @@ -152,13 +148,17 @@ type executableMetadata struct {

func newExecutableMetadata(fileName string, elf *pfelf.File,
fileID libpf.FileID) (*executableMetadata, error) {
isGolang := elf.IsGolang()

buildID, err := elf.GetBuildID()
if err != nil {
// Some Go executables don't have a GNU build ID, so we don't want to fail
// if we can't get it
if err != nil && !isGolang {
return nil, fmt.Errorf("failed to get build id: %w", err)
}

goBuildID := ""
if elf.IsGolang() {
if isGolang {
goBuildID, err = elf.GetGoBuildID()
if err != nil {
return nil, fmt.Errorf("failed to get go build id: %w", err)
Expand All @@ -184,6 +184,28 @@ func (e *executableMetadata) String() string {
)
}

// handleSymbols extracts the debug symbols found at symbolPath into a temporary
// file (via copySymbols) and uploads that file together with the metadata e.
// Both steps share ctx. The temporary file is closed and deleted before returning,
// whether or not an error occurred.
func (d *DatadogUploader) handleSymbols(ctx context.Context, symbolPath string,
	e *executableMetadata) error {
	tmp, err := os.CreateTemp("", "objcopy-debug")
	if err != nil {
		return fmt.Errorf("failed to create temp file to extract symbols: %w", err)
	}
	tmpPath := tmp.Name()
	defer func() {
		// Close first, then unlink.
		tmp.Close()
		os.Remove(tmpPath)
	}()

	if copyErr := d.copySymbols(ctx, symbolPath, tmpPath); copyErr != nil {
		return fmt.Errorf("failed to copy symbols: %w", copyErr)
	}

	if uploadErr := d.uploadSymbols(ctx, tmp, e); uploadErr != nil {
		return fmt.Errorf("failed to upload symbols: %w", uploadErr)
	}

	return nil
}

func (d *DatadogUploader) copySymbols(ctx context.Context, inputPath, outputPath string) error {
args := []string{
"--only-keep-debug",
Expand All @@ -198,10 +220,8 @@ func (d *DatadogUploader) copySymbols(ctx context.Context, inputPath, outputPath
return nil
}

func (d *DatadogUploader) uploadSymbols(symbolFile *os.File, e *executableMetadata) error {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()

func (d *DatadogUploader) uploadSymbols(ctx context.Context, symbolFile *os.File,
e *executableMetadata) error {
req, err := d.buildSymbolUploadRequest(ctx, symbolFile, e)
if err != nil {
return fmt.Errorf("failed to build symbol upload request: %w", err)
Expand Down Expand Up @@ -279,3 +299,55 @@ func (d *DatadogUploader) buildSymbolUploadRequest(ctx context.Context, symbolFi
r.Header.Set("Content-Encoding", "gzip")
return r, nil
}

// localDebugSymbolsPath returns the path of a local ELF file that carries DWARF
// data for the given executable, or an empty string when none is found.
//
// The executable itself is checked first. If it has no usable DWARF data,
// separate debug files are searched in the same order as GDB (build ID first,
// then debug link):
// https://sourceware.org/gdb/current/onlinedocs/gdb.html/Separate-Debug-Files.html
//
// Only the path is returned; any debug ELF opened during the search is closed
// before this function returns.
func localDebugSymbolsPath(ef *pfelf.File, elfRef *pfelf.Reference) string {
	fileName := elfRef.FileName()

	filePath, err := debugSymbolsPathForElf(ef, fileName)
	if err == nil {
		return filePath
	}
	log.Debugf("ELF symbols not found in %s: %v", fileName, err)

	// First, check based on the GNU build ID
	if debugElf, debugFile := ef.OpenDebugBuildID(elfRef); debugElf != nil {
		filePath, err = debugSymbolsPathForElf(debugElf, debugFile)
		// The handle was only needed to inspect the file; release it so that
		// it is not leaked on every lookup.
		debugElf.Close()
		if err == nil {
			return filePath
		}
		log.Debugf("ELF symbols not found in %s: %v", debugFile, err)
	}

	// Then, check based on the debug link
	if debugElf, debugFile := ef.OpenDebugLink(fileName, elfRef); debugElf != nil {
		filePath, err = debugSymbolsPathForElf(debugElf, debugFile)
		// Same as above: only the path outlives this function.
		debugElf.Close()
		if err == nil {
			return filePath
		}
		log.Debugf("ELF symbols not found in %s: %v", debugFile, err)
	}

	return ""
}

// debugSymbolsPathForElf returns the on-disk path of ef if it contains DWARF data.
// fileName is only used to make the error messages readable.
//
// It returns an error when the path of ef cannot be determined (the underlying
// error is wrapped and can be inspected with errors.Is/errors.As) or when ef has
// no DWARF data.
func debugSymbolsPathForElf(ef *pfelf.File, fileName string) (string, error) {
	filePath, err := ef.FilePath()
	if err != nil {
		return "", fmt.Errorf("failed to get ELF file path for executable %s: %w", fileName, err)
	}
	if !ef.HasDWARFData() {
		return "", fmt.Errorf("executable %s does not have DWARF data", fileName)
	}
	return filePath, nil
}
4 changes: 1 addition & 3 deletions symbolication/iface.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,10 @@
package symbolication

import (
"context"

"github.com/elastic/otel-profiling-agent/libpf"
"github.com/elastic/otel-profiling-agent/libpf/pfelf"
)

type Uploader interface {
HandleExecutable(ctx context.Context, elfRef *pfelf.Reference, fileID libpf.FileID) error
HandleExecutable(elfRef *pfelf.Reference, fileID libpf.FileID)
}
7 changes: 1 addition & 6 deletions symbolication/uploader.go
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
package symbolication

import (
"context"

"github.com/elastic/otel-profiling-agent/libpf"
"github.com/elastic/otel-profiling-agent/libpf/pfelf"
)
Expand All @@ -11,10 +9,7 @@ var _ Uploader = (*NoopUploader)(nil)

type NoopUploader struct{}

func (n *NoopUploader) HandleExecutable(_ context.Context, _ *pfelf.Reference,
_ libpf.FileID) error {
return nil
}
func (n *NoopUploader) HandleExecutable(_ *pfelf.Reference, _ libpf.FileID) {}

func NewNoopUploader() Uploader {
return &NoopUploader{}
Expand Down

0 comments on commit f83ad44

Please sign in to comment.