Skip to content

Commit

Permalink
sysdump: Add gops trace data
Browse files Browse the repository at this point in the history
Add the ability to scrape trace data from gops.  See
https://pkg.go.dev/runtime/trace for more information about the Go
execution tracer.

The overhead derived from enabling the Go execution tracer, while
usually low and suitable for production use, might be higher than
collecting pprof data, so the option is disabled by default.

Signed-off-by: Fabio Falzoi <[email protected]>
  • Loading branch information
pippolo84 committed Oct 18, 2023
1 parent 34b3fb3 commit 1ded6c9
Show file tree
Hide file tree
Showing 4 changed files with 60 additions and 0 deletions.
3 changes: 3 additions & 0 deletions internal/cli/cmd/sysdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -98,6 +98,9 @@ func initSysdumpFlags(cmd *cobra.Command, options *sysdump.Options, optionPrefix
cmd.Flags().BoolVar(&options.Profiling,
optionPrefix+"profiling", sysdump.DefaultProfiling,
"Whether to enable scraping profiling data")
cmd.Flags().BoolVar(&options.Tracing,
optionPrefix+"tracing", sysdump.DefaultTracing,
"Whether to enable scraping tracing data")
cmd.Flags().StringArrayVar(&options.ExtraLabelSelectors,
optionPrefix+"extra-label-selectors", nil,
"Optional set of labels selectors used to target additional pods for log collection.")
Expand Down
1 change: 1 addition & 0 deletions sysdump/constants.go
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,7 @@ var (
"pprof-heap",
"pprof-cpu",
}
gopsTrace = "trace"

// Gateway API resource group versions used for sysdumping these
gatewayClass = schema.GroupVersionResource{
Expand Down
1 change: 1 addition & 0 deletions sysdump/defaults.go
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ const (
DefaultCiliumSpireServerLabelSelector = "app=spire-server"
DefaultDebug = false
DefaultProfiling = true
DefaultTracing = false
DefaultHubbleLabelSelector = labelPrefix + "hubble"
DefaultHubbleFlowsCount = 10000
DefaultHubbleFlowsTimeout = 5 * time.Second
Expand Down
55 changes: 55 additions & 0 deletions sysdump/sysdump.go
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,8 @@ type Options struct {
Debug bool
// Whether to enable scraping profiling data.
Profiling bool
// Whether to enable scraping tracing data.
Tracing bool
// The labels used to target additional pods
ExtraLabelSelectors []string
// The labels used to target Hubble pods.
Expand Down Expand Up @@ -1292,6 +1294,21 @@ func (c *Collector) Run() error {

if c.Options.CiliumNamespace != "" && c.Options.CiliumOperatorNamespace != "" {
tasks = append(tasks, ciliumTasks...)

serialTasks = append(serialTasks, Task{
CreatesSubtasks: true,
Description: "Collecting tracing data from Cilium pods",
Quick: false,
Task: func(ctx context.Context) error {
if !c.Options.Tracing {
return nil
}
if err := c.SubmitTracingGopsSubtask(c.CiliumPods, ciliumAgentContainerName); err != nil {
return fmt.Errorf("failed to collect tracing data from Cilium pods: %w", err)
}
return nil
},
})
}

tetragonTasks := []Task{
Expand Down Expand Up @@ -2114,6 +2131,44 @@ func (c *Collector) SubmitProfilingGopsSubtasks(pods []*corev1.Pod, containerNam
return nil
}

// SubmitTracingGopsSubtask submits one pool task per pod that collects Go
// execution trace data from the given container through gops.
//
// Each task looks up the gops PID for the container, runs the gops trace
// command against it, extracts the path of the generated trace file from the
// command output, copies that file out of the pod into the collector's
// temporary directory and finally removes it from the pod.
//
// An error is returned as soon as a task cannot be submitted to the pool.
// Errors raised while a task runs are returned by the task itself. A failure
// to remove the trace file from the pod is only logged as a warning, since the
// trace has already been copied out at that point.
func (c *Collector) SubmitTracingGopsSubtask(pods []*corev1.Pod, containerName string) error {
	for _, p := range pods {
		// Shadow the loop variable so that each closure captures its own pod.
		p := p
		if err := c.Pool.Submit(fmt.Sprintf("gops-%s-%s", p.Name, gopsTrace), func(ctx context.Context) error {
			agentPID, err := c.getGopsPID(ctx, p, containerName)
			if err != nil {
				return err
			}
			o, err := c.Client.ExecInPod(ctx, p.Namespace, p.Name, containerName, []string{
				gopsCommand,
				gopsTrace,
				agentPID,
			})
			if err != nil {
				return fmt.Errorf("failed to collect gops trace for %q (%q) in namespace %q: %w", p.Name, containerName, p.Namespace, err)
			}
			// The trace is written to a file inside the pod; its path is
			// extracted from the command output.
			filePath, err := extractGopsProfileData(o.String())
			if err != nil {
				return fmt.Errorf("failed to collect gops trace for %q (%q) in namespace %q: %w", p.Name, containerName, p.Namespace, err)
			}
			// NOTE(review): "<ts>" looks like a placeholder that is substituted
			// elsewhere (presumably with a timestamp) -- confirm against
			// AbsoluteTempPath.
			f := c.AbsoluteTempPath(fmt.Sprintf("%s-%s-<ts>.trace", p.Name, gopsTrace))
			err = c.Client.CopyFromPod(ctx, p.Namespace, p.Name, containerName, filePath, f, c.Options.CopyRetryLimit)
			if err != nil {
				return fmt.Errorf("failed to collect gops trace output for %q: %w", p.Name, err)
			}
			// Best-effort cleanup: the trace has already been copied, so a
			// failed removal is only worth a warning. %v is used here because
			// %w is only understood by fmt.Errorf and would be rendered as
			// "%!w(...)" by any other Printf-style formatter.
			if _, err = c.Client.ExecInPod(ctx, p.Namespace, p.Name, containerName, []string{rmCommand, filePath}); err != nil {
				c.logWarn("failed to delete trace output from pod %q in namespace %q: %v", p.Name, p.Namespace, err)
			}
			return nil
		}); err != nil {
			return fmt.Errorf("failed to submit %s gops task for %q: %w", gopsTrace, p.Name, err)
		}
	}
	return nil
}

// SubmitLogsTasks submits tasks to collect kubernetes logs from pods.
func (c *Collector) SubmitLogsTasks(pods []*corev1.Pod, since time.Duration, limitBytes int64) error {
t := time.Now().Add(-since)
Expand Down

0 comments on commit 1ded6c9

Please sign in to comment.