Skip to content

Commit

Permalink
adds infiniband metrics plugin
Browse files Browse the repository at this point in the history
  • Loading branch information
spencermckee committed Jun 3, 2024
1 parent 709ccb0 commit d289b57
Show file tree
Hide file tree
Showing 17 changed files with 671 additions and 2 deletions.
48 changes: 48 additions & 0 deletions docs/metrics/plugins/infiniband.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# `infiniband` (Linux)

Gathers NVIDIA Infiniband port counters from `/sys/class/infiniband` and debug status parameters from `/sys/class/net`.

## Metrics

Infiniband Port Counter Statistics

Infiniband Status Parameter Statistics

## Architecture

The plugin uses the following data sources:

1. `/sys/class/infiniband`
2. `/sys/class/net`

### Code Locations

- Plugin code interfacing with the Infiniband driver: *pkg/plugin/infiniband/*

## Label Values for Infiniband Port Counters

Below is a running list of all statistics for Infiniband port counters:

- `excessive_buffer_overrun_errors`
- `link_downed`
- `link_error_recovery`
- `local_link_integrity_errors`
- `port_rcv_constraint_errors`
- `port_rcv_data`
- `port_rcv_errors`
- `port_rcv_packets`
- `port_rcv_remote_physical_errors`
- `port_rcv_switch_replay_errors`
- `port_xmit_constraint_errors`
- `port_xmit_data`
- `port_xmit_discards`
- `port_xmit_packets`
- `symbol_error`
- `VL15_dropped`

## Label Values for Infiniband Debug Status Parameters

Below is a running list of all statistics for Infiniband debug status parameters:

- `lro_timeout`
- `link_down_reason`
18 changes: 18 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,24 @@ func InitializeMetrics() {
utils.DNSLabels...,
)

// InfiniBand Metrics
InfinibandCounterStats = exporter.CreatePrometheusGaugeVecForMetric(
exporter.DefaultRegistry,
utils.InfinibandCounterStatsName,
infinibandCounterStatsDescription,
utils.StatName,
utils.Device,
utils.Port,
)

InfinibandStatusParams = exporter.CreatePrometheusGaugeVecForMetric(
exporter.DefaultRegistry,
utils.InfinibandStatusParamsName,
infinibandStatusParamsDescription,
utils.StatName,
utils.InterfaceName,
)

isInitialized = true
metricsLogger.Info("Metrics initialized")
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const (
nodeApiServerHandshakeLatencyDesc = "Histogram depicting latency of the TCP handshake between nodes and Kubernetes API server measured in milliseconds"
dnsRequestCounterDescription = "DNS requests by statistics"
dnsResponseCounterDescription = "DNS responses by statistics"
infinibandCounterStatsDescription = "InfiniBand Counter Statistics"
infinibandStatusParamsDescription = "InfiniBand Status Parameters"

// Control plane metrics
pluginManagerFailedToReconcileCounterDescription = "Number of times the plugin manager failed to reconcile the plugins"
Expand Down Expand Up @@ -86,6 +88,9 @@ var (
// DNS Metrics.
DNSRequestCounter ICounterVec
DNSResponseCounter ICounterVec

InfinibandCounterStats IGaugeVec
InfinibandStatusParams IGaugeVec
)

func ToPrometheusType(metric interface{}) prometheus.Collector {
Expand Down
11 changes: 11 additions & 0 deletions pkg/plugin/infiniband/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# Regenerates the mockgen-based mocks for the infiniband plugin.
REPO_ROOT = $(shell git rev-parse --show-toplevel)
TOOLS_BIN_DIR = $(REPO_ROOT)/hack/tools/bin
MOCKGEN = $(TOOLS_BIN_DIR)/mockgen

.PHONY: generate

generate: $(MOCKGEN) ## Generate mock clients
	$(MOCKGEN) -source=$(REPO_ROOT)/pkg/plugin/infiniband/types_linux.go -copyright_file=$(REPO_ROOT)/pkg/lib/ignore_headers.txt -package=infiniband > infiniband_mock_generated.go

# Delegate building mockgen to the Makefile at the repository root.
# $(MAKE) is used instead of a literal `make` so the sub-make runs the same
# make binary and inherits command-line flags and the jobserver.
$(MOCKGEN):
	@$(MAKE) -C $(REPO_ROOT) $(MOCKGEN)
36 changes: 36 additions & 0 deletions pkg/plugin/infiniband/embedded_filesystem.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
package infiniband

import "testing/fstest"

// embeddedFs is an in-memory file tree that mimics the sysfs layout read by
// the plugin: per-port counter files under "infiniband/<device>/ports/<port>/counters"
// and per-interface debug files under "net/<interface>/debug". Every file
// holds the single byte "1".
var embeddedFs = fstest.MapFS{ // nolint unused
	// Port counters for device mlx5_ib0, ports 1 and 2.
	"infiniband/mlx5_ib0/ports/1/counters/excessive_buffer_overrun_errors": {Data: []byte("1")},
	"infiniband/mlx5_ib0/ports/1/counters/VL15_dropped":                    {Data: []byte("1")},
	"infiniband/mlx5_ib0/ports/2/counters/excessive_buffer_overrun_errors": {Data: []byte("1")},
	"infiniband/mlx5_ib0/ports/2/counters/VL15_dropped":                    {Data: []byte("1")},

	// Port counters for device mlx5_an0, port 1.
	"infiniband/mlx5_an0/ports/1/counters/excessive_buffer_overrun_errors": {Data: []byte("1")},
	"infiniband/mlx5_an0/ports/1/counters/VL15_dropped":                    {Data: []byte("1")},

	// Debug status parameters for interfaces ib0 and docker0.
	"net/ib0/debug/link_down_reason":     {Data: []byte("1")},
	"net/ib0/debug/lro_timeout":          {Data: []byte("1")},
	"net/docker0/debug/link_down_reason": {Data: []byte("1")},
	"net/docker0/debug/lro_timeout":      {Data: []byte("1")},
}
80 changes: 80 additions & 0 deletions pkg/plugin/infiniband/infiniband_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,80 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

// Package infiniband contains the Retina infiniband plugin. It gathers infiniband statistics and debug status parameters.
package infiniband

import (
"context"
"time"

hubblev1 "github.com/cilium/cilium/pkg/hubble/api/v1"
kcfg "github.com/microsoft/retina/pkg/config"
"github.com/microsoft/retina/pkg/log"
"github.com/microsoft/retina/pkg/plugin/api"
"go.uber.org/zap"
)

// New returns an infiniband plugin that holds cfg and logs through a logger
// named after the plugin.
func New(cfg *kcfg.Config) api.Plugin {
	plugin := &infiniband{
		l:   log.Logger().Named(string(Name)),
		cfg: cfg,
	}
	return plugin
}

// Name reports the plugin's name as a plain string.
func (ib *infiniband) Name() string {
	pluginName := string(Name)
	return pluginName
}

// Generate is a no-op for this plugin: it ignores ctx and always returns nil.
func (ib *infiniband) Generate(ctx context.Context) error { //nolint //implementing iface
	return nil
}

// Compile is a no-op for this plugin: it ignores ctx and always returns nil.
func (ib *infiniband) Compile(ctx context.Context) error { //nolint // implementing iface
	return nil
}

// Init logs that the plugin is initializing. It does no other work and
// always returns nil.
func (ib *infiniband) Init() error {
	const startupMsg = "Initializing infiniband plugin..."
	ib.l.Info(startupMsg)
	return nil
}

// Start flags the plugin as running and then hands control to run, returning
// whatever run returns. It does not return until run does.
func (ib *infiniband) Start(ctx context.Context) error {
	ib.isRunning = true
	err := ib.run(ctx)
	return err
}

// SetupChannel is not supported by this plugin. It logs a warning, leaves ch
// untouched, and returns nil.
func (ib *infiniband) SetupChannel(ch chan *hubblev1.Event) error { // nolint // impl. iface
	pluginField := zap.String("plugin", string(Name))
	ib.l.Warn("Plugin does not support SetupChannel", pluginField)
	return nil
}

// invalidIntervalError reports a metrics interval that cannot drive a ticker.
type invalidIntervalError struct {
	interval time.Duration
}

// Error implements the error interface.
func (e invalidIntervalError) Error() string {
	return "infiniband: metrics interval must be positive, got " + e.interval.String()
}

// run reads infiniband statistics once per cfg.MetricsInterval until ctx is
// done.
//
// It returns an invalidIntervalError when the configured interval is not
// positive, and nil once ctx is done. A failed read is logged and does not
// stop the loop.
func (ib *infiniband) run(ctx context.Context) error {
	interval := ib.cfg.MetricsInterval
	if interval <= 0 {
		// time.NewTicker panics on a non-positive duration; report the bad
		// configuration to the caller instead of panicking.
		return invalidIntervalError{interval: interval}
	}

	ib.l.Info("Running infiniband plugin...")
	ticker := time.NewTicker(interval)
	defer ticker.Stop()

	for {
		select {
		case <-ctx.Done():
			ib.l.Info("Context is done, infiniband will stop running")
			return nil
		case <-ticker.C:
			// A new reader is created for every tick, so nothing is carried
			// over from the previous read.
			infinibandReader := NewInfinibandReader()
			if err := infinibandReader.readAndUpdate(); err != nil {
				ib.l.Error("Reading infiniband stats failed", zap.Error(err))
			}
		}
	}
}

// Stop clears the running flag and logs that the plugin is stopping. When the
// plugin is not running it does nothing. It always returns nil.
func (ib *infiniband) Stop() error {
	if ib.isRunning {
		ib.l.Info("Stopping infiniband plugin...")
		ib.isRunning = false
	}
	return nil
}
77 changes: 77 additions & 0 deletions pkg/plugin/infiniband/infiniband_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

//go:build unit
// +build unit

package infiniband

import (
"context"
"testing"
"time"

kcfg "github.com/microsoft/retina/pkg/config"

"github.com/microsoft/retina/pkg/log"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
)

// Plugin configurations shared by the tests in this file. Both use a
// one-second metrics interval and differ only in EnablePodLevel.
var (
	cfgPodLevelEnabled = &kcfg.Config{
		EnablePodLevel:  true,
		MetricsInterval: time.Second,
	}
	cfgPodLevelDisabled = &kcfg.Config{
		EnablePodLevel:  false,
		MetricsInterval: time.Second,
	}
)

// TestStop checks that Stop returns no error and leaves isRunning false, both
// for a plugin that was never started and for one flagged as running.
//
// Assertions use require, like TestShutdown in this file, so a failure prints
// the actual error rather than a fixed message.
func TestStop(t *testing.T) {
	log.SetupZapLogger(log.GetDefaultLogOpts())
	p := &infiniband{
		cfg: cfgPodLevelEnabled,
		l:   log.Logger().Named(string(Name)),
	}

	// Stopping a plugin that is not running must be a harmless no-op.
	require.NoError(t, p.Stop())
	require.False(t, p.isRunning, "Expected isRunning to be false")

	// Stopping a running plugin must clear the flag.
	p.isRunning = true
	require.NoError(t, p.Stop())
	require.False(t, p.isRunning, "Expected isRunning to be false")
}

// TestShutdown starts the plugin in an errgroup goroutine, cancels the parent
// context after one second, and expects Start to return without error. The
// 100-second metrics interval is far longer than the test's one-second wait.
func TestShutdown(t *testing.T) {
	log.SetupZapLogger(log.GetDefaultLogOpts())

	cfg := &kcfg.Config{
		MetricsInterval: 100 * time.Second,
		EnablePodLevel:  true,
	}
	plugin := &infiniband{
		cfg: cfg,
		l:   log.Logger().Named(string(Name)),
	}

	parent, stop := context.WithCancel(context.Background())
	group, groupCtx := errgroup.WithContext(parent)
	group.Go(func() error {
		return plugin.Start(groupCtx)
	})

	time.Sleep(1 * time.Second)
	stop()

	waitErr := group.Wait()
	require.NoError(t, waitErr)
}
10 changes: 10 additions & 0 deletions pkg/plugin/infiniband/infiniband_mock_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit d289b57

Please sign in to comment.