Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(nvidia/error-xid-sxid): new component based on persistent xid, sxid event history #157

Merged
merged 1 commit into from
Nov 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/bad-envs/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, bad_envs_id.Name)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, bad_envs_id.Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/clock-speed/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/clock/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/ecc/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
128 changes: 128 additions & 0 deletions components/accelerator/nvidia/error-xid-sxid/component.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
// Package errorxidsxid implements the NVIDIA GPU driver Xid/SXid error detector.
package errorxidsxid

import (
"context"
"database/sql"
"fmt"
"strconv"
"time"

"github.com/dustin/go-humanize"
"github.com/leptonai/gpud/components"
nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id"
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query"
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state"
"github.com/leptonai/gpud/components/query"
"github.com/leptonai/gpud/log"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

// New builds the Xid/SXid error component.
//
// It fills in query defaults on cfg, registers this component with the
// default NVIDIA query poller under a cancellable child of ctx, and keeps
// the database handle from cfg.Query.State for later event reads.
func New(ctx context.Context, cfg Config) components.Component {
	cfg.Query.SetDefaultsIfNotSet()

	// Starting the default poller is what kicks off the Xid poller
	// (via "nvml.StartDefaultInstance").
	pollCtx, stop := context.WithCancel(ctx)
	nvidia_query.GetDefaultPoller().Start(pollCtx, cfg.Query, nvidia_error_xid_sxid_id.Name)

	c := new(component)
	c.rootCtx = ctx
	c.cancel = stop
	c.poller = nvidia_query.GetDefaultPoller()
	c.db = cfg.Query.State.DB
	return c
}

// Compile-time check that *component satisfies components.Component.
var _ components.Component = (*component)(nil)

// component reports NVIDIA Xid/SXid errors by reading persisted events
// from the database it was constructed with.
type component struct {
// rootCtx is the context that was passed to New.
rootCtx context.Context
// cancel cancels the child context handed to the poller in New.
cancel context.CancelFunc
// poller is the default NVIDIA query poller obtained in New; Close stops
// this component's registration on it.
poller query.Poller
// db is the database handle that Events reads Xid/SXid events from.
db *sql.DB
}

// Name returns the identifier of the Xid/SXid error component.
func (c *component) Name() string {
	return nvidia_error_xid_sxid_id.Name
}

// States reports no states for this component: it always returns a nil
// slice and a nil error.
func (c *component) States(ctx context.Context) ([]components.State, error) {
	var none []components.State
	return none, nil
}

// Names and ExtraInfo keys/values used on the events returned by Events.
const (
// EventNameErroXid is the event name set on events converted from Xid details.
// NOTE: the "Erro" spelling is part of the exported identifier.
EventNameErroXid = "error_xid"
// EventNameErroSXid is the event name set on events converted from SXid details.
EventNameErroSXid = "error_sxid"

// EventKeyUnixSeconds is the ExtraInfo key for the event's unix timestamp
// in seconds, formatted as a base-10 string.
EventKeyUnixSeconds = "unix_seconds"
// EventKeyData is the ExtraInfo key for the encoded Xid/SXid detail.
EventKeyData = "data"
// EventKeyEncoding is the ExtraInfo key naming the encoding of the data value.
EventKeyEncoding = "encoding"
// EventValueEncodingJSON is the encoding value used when the data is JSON.
EventValueEncodingJSON = "json"
)

// Events returns the persisted Xid/SXid events recorded since the given time,
// converted to component events.
//
// Each stored event is converted to an EventNameErroXid event when it carries
// an Xid detail, otherwise to an EventNameErroSXid event when it carries an
// SXid detail; events with neither detail are skipped. The JSON-encoded detail
// is attached under EventKeyData together with the event's unix timestamp.
//
// It returns an error if the events cannot be read from the database, or if a
// detail cannot be encoded as JSON (previously that encoding error was
// silently dropped and the event was emitted with empty data).
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) {
	events, err := nvidia_xid_sxid_state.ReadEvents(ctx, c.db, nvidia_xid_sxid_state.WithSince(since))
	if err != nil {
		return nil, err
	}

	convertedEvents := make([]components.Event, 0, len(events))
	for _, event := range events {
		var (
			name   string // component event name
			kind   string // "xid" or "sxid", used in messages
			data   []byte // JSON-encoded detail
			encErr error
		)

		// Xid takes precedence; the SXid detail is only looked up when the
		// event has no Xid detail.
		if xidDetail := event.ToXidDetail(); xidDetail != nil {
			name, kind = EventNameErroXid, "xid"
			data, encErr = xidDetail.JSON()
		} else if sxidDetail := event.ToSXidDetail(); sxidDetail != nil {
			name, kind = EventNameErroSXid, "sxid"
			data, encErr = sxidDetail.JSON()
		} else {
			continue
		}
		if encErr != nil {
			// The event is labelled as JSON, so do not emit it with empty data.
			return nil, fmt.Errorf("encoding %s %d detail as JSON: %w", kind, event.EventID, encErr)
		}

		eventTime := time.Unix(event.UnixSeconds, 0)
		convertedEvents = append(convertedEvents, components.Event{
			Time: metav1.Time{Time: eventTime},
			Name: name,
			Message: fmt.Sprintf("%s %d detected by %s (%s)",
				kind,
				event.EventID,
				event.DataSource,
				humanize.Time(eventTime),
			),
			ExtraInfo: map[string]string{
				EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10),
				EventKeyData:        string(data),
				EventKeyEncoding:    EventValueEncodingJSON,
			},
		})
	}
	return convertedEvents, nil
}

// Metrics logs the request at debug level and reports no metrics: it always
// returns a nil slice and a nil error.
func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) {
	var none []components.Metric
	log.Logger.Debugw("querying metrics", "since", since)
	return none, nil
}

// Close stops this component's registration on the poller and always
// returns nil.
func (c *component) Close() error {
	const name = nvidia_error_xid_sxid_id.Name

	log.Logger.Debugw("closing component")

	// Stop is safe to call multiple times, so its result is deliberately ignored.
	_ = c.poller.Stop(name)
	return nil
}
32 changes: 32 additions & 0 deletions components/accelerator/nvidia/error-xid-sxid/config.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
package errorxidsxid

import (
"database/sql"
"encoding/json"

query_config "github.com/leptonai/gpud/components/query/config"
)

// Config is the configuration of the Xid/SXid error component.
type Config struct {
// Query holds the query settings passed to the poller; ParseConfig
// injects the database handle into its State when State is set.
Query query_config.Config `json:"query"`
}

// ParseConfig converts b into a Config by marshalling it to JSON and
// unmarshalling the result. When the parsed configuration has a non-nil
// Query.State, db is stored on it; otherwise db is not used.
//
// It returns any error from the JSON marshal or unmarshal step.
func ParseConfig(b any, db *sql.DB) (*Config, error) {
	encoded, err := json.Marshal(b)
	if err != nil {
		return nil, err
	}

	var parsed Config
	if err := json.Unmarshal(encoded, &parsed); err != nil {
		return nil, err
	}

	if state := parsed.Query.State; state != nil {
		state.DB = db
	}
	return &parsed, nil
}

// Validate checks the configuration. There is nothing to check at present,
// so it always returns nil.
func (Config) Validate() error { return nil }
4 changes: 4 additions & 0 deletions components/accelerator/nvidia/error-xid-sxid/id/id.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
// Package id is the identifier for the nvidia error xid sxid component.
package id

// Name is the component name of the NVIDIA Xid/SXid error component.
const Name = "accelerator-nvidia-error-xid-sxid"
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/error/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/fabric-manager/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@ func New(ctx context.Context, cfg Config) (components.Component, error) {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Log.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

if err := cfg.Log.Validate(); err != nil {
ccancel()
Expand All @@ -38,7 +39,7 @@ func New(ctx context.Context, cfg Config) (components.Component, error) {
return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
logPoller: fabric_manager_log.GetDefaultPoller(),
}, nil
}
Expand Down
14 changes: 12 additions & 2 deletions components/accelerator/nvidia/fabric-manager/component_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,11 @@ import (
"testing"
"time"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"

query_config "github.com/leptonai/gpud/components/query/config"
query_log_config "github.com/leptonai/gpud/components/query/log/config"
"github.com/leptonai/gpud/pkg/sqlite"

metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func TestComponentLog(t *testing.T) {
Expand All @@ -28,13 +29,22 @@ func TestComponentLog(t *testing.T) {
ctx, cancel := context.WithCancel(context.Background())
defer cancel()

db, err := sqlite.Open(":memory:")
if err != nil {
t.Fatalf("failed to open database: %v", err)
}
defer db.Close()

pollInterval := 3 * time.Second
component, err := New(
ctx,
Config{
Log: query_log_config.Config{
Query: query_config.Config{
Interval: metav1.Duration{Duration: pollInterval},
State: &query_config.State{
DB: db,
},
},
BufferSize: query_log_config.DefaultBufferSize,
File: f.Name(),
Expand Down
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/gsp-firmware-mode/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,12 +17,12 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_gsp_firmware_mode_id.Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/infiniband/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -19,12 +19,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/info/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,12 +18,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/memory/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
4 changes: 2 additions & 2 deletions components/accelerator/nvidia/nccl/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,12 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, nvidia_nccl_id.Name)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, nvidia_nccl_id.Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
5 changes: 3 additions & 2 deletions components/accelerator/nvidia/nvlink/component.go
Original file line number Diff line number Diff line change
Expand Up @@ -22,12 +22,13 @@ func New(ctx context.Context, cfg Config) components.Component {
cfg.Query.SetDefaultsIfNotSet()

cctx, ccancel := context.WithCancel(ctx)
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, Name)
nvidia_query.SetDefaultPoller(cfg.Query.State.DB)
nvidia_query.GetDefaultPoller().Start(cctx, cfg.Query, Name)

return &component{
rootCtx: ctx,
cancel: ccancel,
poller: nvidia_query.DefaultPoller,
poller: nvidia_query.GetDefaultPoller(),
}
}

Expand Down
Loading