-
Notifications
You must be signed in to change notification settings - Fork 13
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(nvidia): persist xid, sxid events
feat(nvidia/query/xid-sxid-state): initial commit; feat(server): call create table, purge for xid/sxid events table; feat(pkg/sqlite): add Open function; feat(server): sqlite in server; feat(nvml): persist xid. Signed-off-by: Gyuho Lee <[email protected]>
- Loading branch information
Showing
75 changed files
with
1,696 additions
and
291 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
128 changes: 128 additions & 0 deletions
128
components/accelerator/nvidia/error-xid-sxid/component.go
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
// Package errorxidsxid implements NVIDIA GPU driver Xid/SXid error detector. | ||
package errorxidsxid | ||
|
||
import ( | ||
"context" | ||
"database/sql" | ||
"fmt" | ||
"strconv" | ||
"time" | ||
|
||
"github.com/dustin/go-humanize" | ||
"github.com/leptonai/gpud/components" | ||
nvidia_error_xid_sxid_id "github.com/leptonai/gpud/components/accelerator/nvidia/error-xid-sxid/id" | ||
nvidia_query "github.com/leptonai/gpud/components/accelerator/nvidia/query" | ||
nvidia_xid_sxid_state "github.com/leptonai/gpud/components/accelerator/nvidia/query/xid-sxid-state" | ||
"github.com/leptonai/gpud/components/query" | ||
"github.com/leptonai/gpud/log" | ||
|
||
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" | ||
) | ||
|
||
func New(ctx context.Context, cfg Config) components.Component { | ||
cfg.Query.SetDefaultsIfNotSet() | ||
|
||
// this starts the Xid poller via "nvml.StartDefaultInstance" | ||
cctx, ccancel := context.WithCancel(ctx) | ||
nvidia_query.DefaultPoller.Start(cctx, cfg.Query, nvidia_error_xid_sxid_id.Name) | ||
|
||
return &component{ | ||
rootCtx: ctx, | ||
cancel: ccancel, | ||
poller: nvidia_query.DefaultPoller, | ||
db: cfg.Query.State.DB, | ||
} | ||
} | ||
|
||
var _ components.Component = (*component)(nil) | ||
|
||
type component struct { | ||
rootCtx context.Context | ||
cancel context.CancelFunc | ||
poller query.Poller | ||
db *sql.DB | ||
} | ||
|
||
func (c *component) Name() string { return nvidia_error_xid_sxid_id.Name } | ||
|
||
func (c *component) States(ctx context.Context) ([]components.State, error) { | ||
return nil, nil | ||
} | ||
|
||
// Event names and ExtraInfo keys/values used by the events this component emits.
//
// NOTE: "Erro" in the event-name identifiers is a misspelling of "Error"; the
// exported names are kept as-is so existing references keep compiling.
const (
	// EventNameErroXid is the event name set on Xid events.
	EventNameErroXid = "error_xid"
	// EventNameErroSXid is the event name set on SXid events.
	EventNameErroSXid = "error_sxid"

	// EventKeyUnixSeconds is the ExtraInfo key holding the event time as
	// base-10 unix seconds.
	EventKeyUnixSeconds = "unix_seconds"
	// EventKeyData is the ExtraInfo key holding the encoded Xid/SXid detail.
	EventKeyData = "data"
	// EventKeyEncoding is the ExtraInfo key naming the encoding of EventKeyData.
	EventKeyEncoding = "encoding"
	// EventValueEncodingJSON is the EventKeyEncoding value for JSON payloads.
	EventValueEncodingJSON = "json"
)
|
||
func (c *component) Events(ctx context.Context, since time.Time) ([]components.Event, error) { | ||
events, err := nvidia_xid_sxid_state.ReadEvents(ctx, c.db, nvidia_xid_sxid_state.WithSince(since)) | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
convertedEvents := make([]components.Event, 0, len(events)) | ||
for _, event := range events { | ||
if xidDetail := event.ToXidDetail(); xidDetail != nil { | ||
msg := fmt.Sprintf("xid %d detected by %s (%s)", | ||
event.EventID, | ||
event.DataSource, | ||
humanize.Time(time.Unix(event.UnixSeconds, 0)), | ||
) | ||
xidBytes, _ := xidDetail.JSON() | ||
|
||
convertedEvents = append(convertedEvents, components.Event{ | ||
Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0)}, | ||
Name: EventNameErroXid, | ||
Message: msg, | ||
ExtraInfo: map[string]string{ | ||
EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10), | ||
EventKeyData: string(xidBytes), | ||
EventKeyEncoding: EventValueEncodingJSON, | ||
}, | ||
}) | ||
continue | ||
} | ||
|
||
if sxidDetail := event.ToSXidDetail(); sxidDetail != nil { | ||
msg := fmt.Sprintf("sxid %d detected by %s (%s)", | ||
event.EventID, | ||
event.DataSource, | ||
humanize.Time(time.Unix(event.UnixSeconds, 0)), | ||
) | ||
sxidBytes, _ := sxidDetail.JSON() | ||
|
||
convertedEvents = append(convertedEvents, components.Event{ | ||
Time: metav1.Time{Time: time.Unix(event.UnixSeconds, 0)}, | ||
Name: EventNameErroSXid, | ||
Message: msg, | ||
ExtraInfo: map[string]string{ | ||
EventKeyUnixSeconds: strconv.FormatInt(event.UnixSeconds, 10), | ||
EventKeyData: string(sxidBytes), | ||
EventKeyEncoding: EventValueEncodingJSON, | ||
}, | ||
}) | ||
continue | ||
} | ||
} | ||
return convertedEvents, nil | ||
} | ||
|
||
func (c *component) Metrics(ctx context.Context, since time.Time) ([]components.Metric, error) { | ||
log.Logger.Debugw("querying metrics", "since", since) | ||
|
||
return nil, nil | ||
} | ||
|
||
func (c *component) Close() error { | ||
log.Logger.Debugw("closing component") | ||
|
||
// safe to call stop multiple times | ||
_ = c.poller.Stop(nvidia_error_xid_sxid_id.Name) | ||
|
||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,32 @@ | ||
package errorxidsxid | ||
|
||
import ( | ||
"database/sql" | ||
"encoding/json" | ||
|
||
query_config "github.com/leptonai/gpud/components/query/config" | ||
) | ||
|
||
type Config struct { | ||
Query query_config.Config `json:"query"` | ||
} | ||
|
||
func ParseConfig(b any, db *sql.DB) (*Config, error) { | ||
raw, err := json.Marshal(b) | ||
if err != nil { | ||
return nil, err | ||
} | ||
cfg := new(Config) | ||
err = json.Unmarshal(raw, cfg) | ||
if err != nil { | ||
return nil, err | ||
} | ||
if cfg.Query.State != nil { | ||
cfg.Query.State.DB = db | ||
} | ||
return cfg, nil | ||
} | ||
|
||
func (cfg Config) Validate() error { | ||
return nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// Package id is the identifier for the nvidia error xid sxid component. | ||
package id | ||
|
||
const (
	// Name is the identifier of the nvidia error xid sxid component.
	Name = "accelerator-nvidia-error-xid-sxid"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
// Package id is the identifier for the nvidia error sxid component. | ||
package id | ||
|
||
const (
	// Name is the identifier of the nvidia error sxid component.
	Name = "accelerator-nvidia-error-sxid"
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.