Skip to content

Commit

Permalink
adds infiniband metrics plugin
Browse files Browse the repository at this point in the history
Signed-off-by: Spencer McKee <[email protected]>
  • Loading branch information
spencermckee committed Apr 26, 2024
1 parent 44a2a59 commit 902d328
Show file tree
Hide file tree
Showing 29 changed files with 847 additions and 0 deletions.
48 changes: 48 additions & 0 deletions docs/metrics/plugins/infiniband.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# `infiniband` (Linux)

Gathers NVIDIA InfiniBand port counters from `/sys/class/infiniband` and debug status parameters from `/sys/class/net`.

## Metrics

Infiniband Port Counter Statistics

Infiniband Status Parameter Statistics

## Architecture

The plugin uses the following data sources:

1. `/sys/class/infiniband`
2. `/sys/class/net`

### Code Locations

- Plugin code interfacing with the Infiniband driver: *pkg/plugin/infiniband/*

## Label Values for Infiniband Port Counters

Below is a running list of all statistics for Infiniband port counters:

- `excessive_buffer_overrun_errors`
- `link_downed`
- `link_error_recovery`
- `local_link_integrity_errors`
- `port_rcv_constraint_errors`
- `port_rcv_data`
- `port_rcv_errors`
- `port_rcv_packets`
- `port_rcv_remote_physical_errors`
- `port_rcv_switch_replay_errors`
- `port_xmit_constraint_errors`
- `port_xmit_data`
- `port_xmit_discards`
- `port_xmit_packets`
- `symbol_error`
- `VL15_dropped`

## Label Values for Infiniband Debug Status Parameters

Below is a running list of all statistics for Infiniband debug status parameters:

- `lro_timeout`
- `link_down_reason`
Empty file.
98 changes: 98 additions & 0 deletions pkg/log/test.log

Large diffs are not rendered by default.

18 changes: 18 additions & 0 deletions pkg/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,24 @@ func InitializeMetrics() {
utils.DNSLabels...,
)

// InfiniBand Metrics
InfinibandCounterStats = exporter.CreatePrometheusGaugeVecForMetric(
exporter.DefaultRegistry,
utils.InfinibandCounterStatsName,
infinibandCounterStatsDescription,
utils.StatName,
utils.Device,
utils.Port,
)

InfinibandStatusParams = exporter.CreatePrometheusGaugeVecForMetric(
exporter.DefaultRegistry,
utils.InfinibandStatusParamsName,
infinibandStatusParamsDescription,
utils.StatName,
utils.InterfaceName,
)

isInitialized = true
metricsLogger.Info("Metrics initialized")
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,8 @@ const (
nodeApiServerHandshakeLatencyDesc = "Histogram depicting latency of the TCP handshake between nodes and Kubernetes API server measured in milliseconds"
dnsRequestCounterDescription = "DNS requests by statistics"
dnsResponseCounterDescription = "DNS responses by statistics"
infinibandCounterStatsDescription = "InfiniBand Counter Statistics"
infinibandStatusParamsDescription = "InfiniBand Status Parameters"

// Control plane metrics
pluginManagerFailedToReconcileCounterDescription = "Number of times the plugin manager failed to reconcile the plugins"
Expand Down Expand Up @@ -86,6 +88,9 @@ var (
// DNS Metrics.
DNSRequestCounter ICounterVec
DNSResponseCounter ICounterVec

InfinibandCounterStats IGaugeVec
InfinibandStatusParams IGaugeVec
)

func ToPrometheusType(metric interface{}) prometheus.Collector {
Expand Down
11 changes: 11 additions & 0 deletions pkg/plugin/infiniband/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
# REPO_ROOT is the top-level directory of the git checkout; the mockgen binary
# is expected under hack/tools/bin inside it.
REPO_ROOT = $(shell git rev-parse --show-toplevel)
TOOLS_BIN_DIR = $(REPO_ROOT)/hack/tools/bin
MOCKGEN = $(TOOLS_BIN_DIR)/mockgen

.PHONY: generate

# Runs mockgen over types_linux.go and writes the result to
# infiniband_mock_generated.go in the current directory.
generate: $(MOCKGEN) ## Generate mock clients
	$(MOCKGEN) -source=$(REPO_ROOT)/pkg/plugin/infiniband/types_linux.go -copyright_file=$(REPO_ROOT)/pkg/lib/ignore_headers.txt -package=infiniband > infiniband_mock_generated.go

# Delegates building mockgen to the Makefile at the repository root. $(MAKE) is
# used rather than a literal `make` so the sub-make inherits the parent's flags
# (-j jobserver, -n, -k) and uses the same make binary.
$(MOCKGEN):
	@$(MAKE) -C $(REPO_ROOT) $(MOCKGEN)
89 changes: 89 additions & 0 deletions pkg/plugin/infiniband/infiniband_linux.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

// Package infiniband contains the Retina infiniband plugin. It gathers infiniband statistics and debug status parameters.
package infiniband

import (
"context"
"sync"
"time"

hubblev1 "github.com/cilium/cilium/pkg/hubble/api/v1"
kcfg "github.com/microsoft/retina/pkg/config"
"github.com/microsoft/retina/pkg/log"
"github.com/microsoft/retina/pkg/plugin/api"
"go.uber.org/zap"
)

// New returns an infiniband plugin, as an api.Plugin, that holds the supplied
// configuration and a logger named after the plugin.
func New(cfg *kcfg.Config) api.Plugin {
	plugin := &infiniband{
		cfg: cfg,
		l:   log.Logger().Named(string(Name)),
	}
	return plugin
}

// Name returns the plugin identifier: the package-level Name converted to a string.
func (lu *infiniband) Name() string {
	pluginName := string(Name)
	return pluginName
}

// Generate is a no-op for this plugin; it ignores ctx and always returns nil.
func (lu *infiniband) Generate(ctx context.Context) error { //nolint //implementing iface
	return nil
}

// Compile is a no-op for this plugin; it ignores ctx and always returns nil.
func (lu *infiniband) Compile(ctx context.Context) error { //nolint // implementing iface
	return nil
}

// Init logs that the plugin is initializing. No other setup is performed and
// the returned error is always nil.
func (lu *infiniband) Init() error {
	const msg = "Initializing infiniband plugin..."
	lu.l.Info(msg)
	return nil
}

// Start marks the plugin as running and then hands control to run, returning
// whatever run returns. It therefore blocks for as long as run does.
func (lu *infiniband) Start(ctx context.Context) error {
	lu.isRunning = true
	err := lu.run(ctx)
	return err
}

// SetupChannel is not supported by this plugin: the channel is ignored, a
// warning naming the plugin is logged, and nil is returned.
func (lu *infiniband) SetupChannel(ch chan *hubblev1.Event) error { // nolint // impl. iface
	pluginField := zap.String("plugin", string(Name))
	lu.l.Warn("Plugin does not support SetupChannel", pluginField)
	return nil
}

// run is the plugin's collection loop. On every tick of a ticker driven by
// cfg.MetricsInterval it creates a new infiniband reader and waits for one
// readAndUpdate pass to finish; a failed pass is logged and the loop carries
// on. The loop ends, returning nil, once ctx is done.
func (lu *infiniband) run(ctx context.Context) error {
	lu.l.Info("Running infiniband plugin...")

	tick := time.NewTicker(lu.cfg.MetricsInterval)
	defer tick.Stop()

	// collect runs a single read-and-update pass on its own goroutine and
	// blocks until that pass has completed.
	collect := func() {
		var pending sync.WaitGroup

		reader := NewInfinibandReader()
		pending.Add(1)
		go func() {
			defer pending.Done()
			if err := reader.readAndUpdate(); err != nil {
				lu.l.Error("Reading infiniband stats failed", zap.Error(err))
			}
		}()

		pending.Wait()
	}

	for {
		select {
		case <-tick.C:
			collect()
		case <-ctx.Done():
			lu.l.Info("Context is done, infiniband will stop running")
			return nil
		}
	}
}

// Stop clears the running flag, logging the transition, when the plugin is
// currently marked as running. When it is not running nothing happens. The
// returned error is always nil.
func (lu *infiniband) Stop() error {
	if lu.isRunning {
		lu.l.Info("Stopping infiniband plugin...")
		lu.isRunning = false
	}
	return nil
}
77 changes: 77 additions & 0 deletions pkg/plugin/infiniband/infiniband_linux_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

//go:build unit
// +build unit

package infiniband

import (
"context"
"testing"
"time"

kcfg "github.com/microsoft/retina/pkg/config"

"github.com/microsoft/retina/pkg/log"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"
)

// Shared plugin configurations for the tests in this file. Both use a
// one-second metrics interval and differ only in EnablePodLevel.
var (
	cfgPodLevelEnabled = &kcfg.Config{
		EnablePodLevel:  true,
		MetricsInterval: time.Second,
	}
	cfgPodLevelDisabled = &kcfg.Config{
		EnablePodLevel:  false,
		MetricsInterval: time.Second,
	}
)

// TestStop checks that Stop returns no error and leaves isRunning false, both
// for a plugin that was never started and for one marked as running.
func TestStop(t *testing.T) {
	log.SetupZapLogger(log.GetDefaultLogOpts())
	p := &infiniband{
		cfg: cfgPodLevelEnabled,
		l:   log.Logger().Named(string(Name)),
	}

	// First pass: not running. Second pass: flagged as running before Stop.
	for _, running := range []bool{false, true} {
		p.isRunning = running

		if err := p.Stop(); err != nil {
			t.Fatalf("Expected no error")
		}
		if p.isRunning {
			t.Fatalf("Expected isRunning to be false")
		}
	}
}

// TestShutdown starts the plugin with a 100-second metrics interval, cancels
// its context after one second, and requires that Start returns without error.
func TestShutdown(t *testing.T) {
	log.SetupZapLogger(log.GetDefaultLogOpts())

	longIntervalCfg := &kcfg.Config{
		MetricsInterval: 100 * time.Second,
		EnablePodLevel:  true,
	}
	p := &infiniband{
		cfg: longIntervalCfg,
		l:   log.Logger().Named(string(Name)),
	}

	ctx, cancel := context.WithCancel(context.Background())
	g, errctx := errgroup.WithContext(ctx)
	g.Go(func() error { return p.Start(errctx) })

	// Give the plugin time to enter its loop before cancelling.
	time.Sleep(1 * time.Second)
	cancel()

	require.NoError(t, g.Wait())
}
120 changes: 120 additions & 0 deletions pkg/plugin/infiniband/infiniband_mock_generated.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Loading

0 comments on commit 902d328

Please sign in to comment.