From 1ded6c9278ade3fd3474f4eeb720a001a97aa20d Mon Sep 17 00:00:00 2001 From: Fabio Falzoi Date: Tue, 17 Oct 2023 18:17:45 +0200 Subject: [PATCH] sysdump: Add gops trace data Add the ability to scrape trace data from gops. See https://pkg.go.dev/runtime/trace for more information about the Go execution tracer. The overhead derived from enabling the Go execution tracer, while usually low and suitable for production use, might be higher than collecting pprof data, so the option is disabled by default. Signed-off-by: Fabio Falzoi --- internal/cli/cmd/sysdump.go | 3 ++ sysdump/constants.go | 1 + sysdump/defaults.go | 1 + sysdump/sysdump.go | 55 +++++++++++++++++++++++++++++++++++++ 4 files changed, 60 insertions(+) diff --git a/internal/cli/cmd/sysdump.go b/internal/cli/cmd/sysdump.go index 966d047ad9..16c5d2fd22 100644 --- a/internal/cli/cmd/sysdump.go +++ b/internal/cli/cmd/sysdump.go @@ -98,6 +98,9 @@ func initSysdumpFlags(cmd *cobra.Command, options *sysdump.Options, optionPrefix cmd.Flags().BoolVar(&options.Profiling, optionPrefix+"profiling", sysdump.DefaultProfiling, "Whether to enable scraping profiling data") + cmd.Flags().BoolVar(&options.Tracing, + optionPrefix+"tracing", sysdump.DefaultTracing, + "Whether to enable scraping tracing data") cmd.Flags().StringArrayVar(&options.ExtraLabelSelectors, optionPrefix+"extra-label-selectors", nil, "Optional set of labels selectors used to target additional pods for log collection.") diff --git a/sysdump/constants.go b/sysdump/constants.go index 6e6d087c19..ccd69b50e3 100644 --- a/sysdump/constants.go +++ b/sysdump/constants.go @@ -150,6 +150,7 @@ var ( "pprof-heap", "pprof-cpu", } + gopsTrace = "trace" // Gateway API resource group versions used for sysdumping these gatewayClass = schema.GroupVersionResource{ diff --git a/sysdump/defaults.go b/sysdump/defaults.go index 84d2586372..0452cba082 100644 --- a/sysdump/defaults.go +++ b/sysdump/defaults.go @@ -24,6 +24,7 @@ const ( DefaultCiliumSpireServerLabelSelector = 
"app=spire-server" DefaultDebug = false DefaultProfiling = true + DefaultTracing = false DefaultHubbleLabelSelector = labelPrefix + "hubble" DefaultHubbleFlowsCount = 10000 DefaultHubbleFlowsTimeout = 5 * time.Second diff --git a/sysdump/sysdump.go b/sysdump/sysdump.go index 6677136de4..184fe20697 100644 --- a/sysdump/sysdump.go +++ b/sysdump/sysdump.go @@ -67,6 +67,8 @@ type Options struct { Debug bool // Whether to enable scraping profiling data. Profiling bool + // Whether to enable scraping tracing data. + Tracing bool // The labels used to target additional pods ExtraLabelSelectors []string // The labels used to target Hubble pods. @@ -1292,6 +1294,21 @@ func (c *Collector) Run() error { if c.Options.CiliumNamespace != "" && c.Options.CiliumOperatorNamespace != "" { tasks = append(tasks, ciliumTasks...) + + serialTasks = append(serialTasks, Task{ + CreatesSubtasks: true, + Description: "Collecting tracing data from Cilium pods", + Quick: false, + Task: func(ctx context.Context) error { + if !c.Options.Tracing { + return nil + } + if err := c.SubmitTracingGopsSubtask(c.CiliumPods, ciliumAgentContainerName); err != nil { + return fmt.Errorf("failed to collect tracing data from Cilium pods: %w", err) + } + return nil + }, + }) } tetragonTasks := []Task{ @@ -2114,6 +2131,44 @@ func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerNam return nil } +// SubmitTracingGopsSubtask submits task to collect tracing data from pods. 
+func (c *Collector) SubmitTracingGopsSubtask(pods []*corev1.Pod, containerName string) error {
+	for _, p := range pods {
+		p := p // per-iteration copy so each submitted closure captures its own pod (needed before Go 1.22)
+		if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, gopsTrace), func(ctx context.Context) error {
+			agentPID, err := c.getGopsPID(ctx, p, containerName)
+			if err != nil {
+				return err
+			}
+			o, err := c.Client.ExecInPod(ctx, p.Namespace, p.Name, containerName, []string{
+				gopsCommand,
+				gopsTrace,
+				agentPID,
+			})
+			if err != nil {
+				return fmt.Errorf("failed to collect gops trace for %q (%q) in namespace %q: %w", p.Name, containerName, p.Namespace, err)
+			}
+			filePath, err := extractGopsProfileData(o.String())
+			if err != nil {
+				return fmt.Errorf("failed to collect gops trace for %q (%q) in namespace %q: %w", p.Name, containerName, p.Namespace, err)
+			}
+			// NOTE(review): the "-" directly before ".trace" looks like a placeholder was lost here — confirm the intended file name.
+			f := c.AbsoluteTempPath(fmt.Sprintf("%s-%s-.trace", p.Name, gopsTrace))
+			if err := c.Client.CopyFromPod(ctx, p.Namespace, p.Name, containerName, filePath, f, c.Options.CopyRetryLimit); err != nil {
+				return fmt.Errorf("failed to collect gops trace output for %q: %w", p.Name, err)
+			}
+			if _, err = c.Client.ExecInPod(ctx, p.Namespace, p.Name, containerName, []string{rmCommand, filePath}); err != nil {
+				// Cleanup is best-effort: the trace was already copied above, so only warn (%v, since %w is Errorf-only).
+				c.logWarn("failed to delete trace output from pod %q in namespace %q: %v", p.Name, p.Namespace, err)
+			}
+			return nil
+		}); err != nil {
+			return fmt.Errorf("failed to submit %s gops task for %q: %w", gopsTrace, p.Name, err)
+		}
+	}
+	return nil
+}
+
 // SubmitLogsTasks submits tasks to collect kubernetes logs from pods.
 func (c *Collector) SubmitLogsTasks(pods []*corev1.Pod, since time.Duration, limitBytes int64) error {
 	t := time.Now().Add(-since)