Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(nvidia): track row remapping, RMA/GPU reset status #80

Merged
merged 10 commits into from
Sep 30, 2024
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/info/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

o := &Output{
GPU: GPU{Attached: i.GPUCounts()},
GPU: GPU{Attached: i.GPUCount()},
Memory: Memory{
TotalBytes: totalMem,
TotalHumanized: totalMemHumanized,
Expand Down
20 changes: 12 additions & 8 deletions components/accelerator/nvidia/memory/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,20 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

o := &Output{}
for _, g := range i.SMI.GPUs {
if g.FBMemoryUsage == nil {
continue
}
parsed, err := g.FBMemoryUsage.Parse()
if err != nil {
continue

if i.SMI != nil {
for _, g := range i.SMI.GPUs {
if g.FBMemoryUsage == nil {
continue
}
parsed, err := g.FBMemoryUsage.Parse()
if err != nil {
continue
}
o.UsagesSMI = append(o.UsagesSMI, parsed)
}
o.UsagesSMI = append(o.UsagesSMI, parsed)
}

if i.NVML != nil {
for _, device := range i.NVML.DeviceInfos {
o.UsagesNVML = append(o.UsagesNVML, device.Memory)
Expand Down
6 changes: 3 additions & 3 deletions components/accelerator/nvidia/peermem/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func ToOutput(i *nvidia_query.Output) *Output {

o := &Output{
LsmodPeermem: *i.LsmodPeermem,
GPUCounts: i.GPUCounts(),
GPUCount: i.GPUCount(),
ProductName: i.GPUProductName(),
}
return o
Expand All @@ -27,7 +27,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
type Output struct {
// Represents the number of GPUs in the system.
// This is used to determine if ibcore may be expected to use peermem module.
GPUCounts int `json:"gpu_counts"`
GPUCount int `json:"gpu_count"`

ProductName string `json:"product_name"`
LsmodPeermem nvidia_query.LsmodPeermemModuleOutput `json:"lsmod_peermem"`
Expand Down Expand Up @@ -90,7 +90,7 @@ func (o *Output) States() ([]components.State, error) {
// so we don't decide whether peermem is required or not
Healthy: true,

Reason: fmt.Sprintf("ibcore is using peermem module? %v (gpu counts: %d)", o.LsmodPeermem.IbcoreUsingPeermemModule, o.GPUCounts),
Reason: fmt.Sprintf("ibcore is using peermem module? %v (gpu counts: %d)", o.LsmodPeermem.IbcoreUsingPeermemModule, o.GPUCount),
ExtraInfo: map[string]string{
StateKeyLsmodPeermemData: string(b),
StateKeyLsmodPeermemEncoding: StateValueLsmodPeermemEncodingJSON,
Expand Down
64 changes: 64 additions & 0 deletions components/accelerator/nvidia/query/gpu_memory.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package query

import "strings"

// GetMemoryErrorManagementCapabilities returns the GPU memory error management
// capabilities inferred from the GPU product name.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities {
	name := strings.ToLower(gpuProductName)

	switch {
	// H100 and A100 support the full feature set.
	case strings.Contains(name, "h100"), strings.Contains(name, "a100"):
		return MemoryErrorManagementCapabilities{
			ErrorContainment:     true,
			DynamicPageOfflining: true,
			RowRemapping:         true,
		}

	// NOTE: must come after the "a100" case, since "a100" also contains "a10".
	case strings.Contains(name, "a10"):
		return MemoryErrorManagementCapabilities{
			RowRemapping: true,
		}

	// Unknown products advertise no memory error management capabilities.
	default:
		return MemoryErrorManagementCapabilities{}
	}
}

// MemoryErrorManagementCapabilities describes which GPU memory error
// management features a GPU supports.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
type MemoryErrorManagementCapabilities struct {
	// (If supported) GPU can limit the impact of uncorrectable ECC errors to GPU applications.
	// Existing/new workloads will run unaffected, both in terms of accuracy and performance.
	// Thus, does not require a GPU reset when memory errors occur.
	//
	// Note that there are some rarer cases, where uncorrectable errors are still uncontained
	// thus impacting all other workloads being processed in the GPU.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#error-containments
	ErrorContainment bool `json:"error_containment"`

	// (If supported) GPU can dynamically mark the page containing uncorrectable errors
	// as unusable, and any existing or new workloads will not be allocating this page.
	//
	// Thus, does not require a GPU reset to recover from most uncorrectable ECC errors.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#dynamic-page-offlining
	DynamicPageOfflining bool `json:"dynamic_page_offlining"`

	// (If supported) GPU can replace degrading memory cells with spare ones
	// to avoid offlining regions of memory. And the row remapping is different
	// from dynamic page offlining which is fixed at a hardware level.
	//
	// The row remapping requires a GPU reset to take effect.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#row-remapping
	RowRemapping bool `json:"row_remapping"`
}
63 changes: 63 additions & 0 deletions components/accelerator/nvidia/query/gpu_memory_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package query

import (
"reflect"
"testing"
)

// TestGetMemoryErrorManagementCapabilities verifies the product-name to
// capability mapping for representative GPUs, including case-insensitivity.
func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
	tests := []struct {
		name           string
		gpuProductName string
		expected       MemoryErrorManagementCapabilities
	}{
		{
			name:           "NVIDIA H100 80GB HBM3",
			gpuProductName: "NVIDIA H100 80GB HBM3",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
		{
			name:           "NVIDIA GeForce RTX 4090",
			gpuProductName: "NVIDIA GeForce RTX 4090",
			expected:       MemoryErrorManagementCapabilities{},
		},
		{
			name:           "NVIDIA A10",
			gpuProductName: "NVIDIA A10",
			expected: MemoryErrorManagementCapabilities{
				RowRemapping: true,
			},
		},
		{
			name:           "NVIDIA A100",
			gpuProductName: "NVIDIA A100",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
		{
			name:           "Lowercase input",
			gpuProductName: "nvidia h100 80gb hbm3",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := GetMemoryErrorManagementCapabilities(tt.gpuProductName)
			if !reflect.DeepEqual(result, tt.expected) {
				// Fixed: the failure message previously referenced a
				// nonexistent function name ("GetGPUMemoryErrorManagement").
				t.Errorf("GetMemoryErrorManagementCapabilities(%q) = %v, want %v", tt.gpuProductName, result, tt.expected)
			}
		})
	}
}
152 changes: 152 additions & 0 deletions components/accelerator/nvidia/query/metrics/remapped-rows/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Package remappedrows provides the NVIDIA row remapping metrics collection and reporting.
package remappedrows

import (
"context"
"database/sql"
"time"

components_metrics "github.com/leptonai/gpud/components/metrics"
components_metrics_state "github.com/leptonai/gpud/components/metrics/state"

"github.com/prometheus/client_golang/prometheus"
)

// SubSystem is the Prometheus subsystem label shared by all metrics below,
// and the name prefix used for the DB-backed averager metric names.
const SubSystem = "accelerator_nvidia_remapped_rows"

var (
	// lastUpdateUnixSeconds tracks when the remapped-rows metrics were last refreshed.
	lastUpdateUnixSeconds = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "last_update_unix_seconds",
			Help:      "tracks the last update time in unix seconds",
		},
	)

	// uncorrectableErrors reports, per GPU, the number of rows remapped
	// due to uncorrectable errors.
	uncorrectableErrors = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "due_to_uncorrectable_errors",
			Help:      "tracks the number of rows remapped due to uncorrectable errors",
		},
		[]string{"gpu_id"},
	)
	// uncorrectableErrorsAverager persists observations; starts as a no-op
	// until InitAveragers swaps in a DB-backed implementation.
	uncorrectableErrorsAverager = components_metrics.NewNoOpAverager()

	// remappingPending is set to 1 per GPU when a reset is still required
	// for the row remapping to take effect.
	remappingPending = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "remapping_pending",
			Help:      "set to 1 if this GPU requires a reset to actually remap the row",
		},
		[]string{"gpu_id"},
	)
	// remappingPendingAverager persists observations; no-op until InitAveragers runs.
	remappingPendingAverager = components_metrics.NewNoOpAverager()

	// remappingFailed is set to 1 per GPU when a row remapping has failed in the past.
	remappingFailed = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "remapping_failed",
			Help:      "set to 1 if a remapping has failed in the past",
		},
		[]string{"gpu_id"},
	)
	// remappingFailedAverager persists observations; no-op until InitAveragers runs.
	remappingFailedAverager = components_metrics.NewNoOpAverager()
)

// InitAveragers replaces the package-level no-op averagers with DB-backed
// ones that persist observations into the given table, one per metric
// tracked by this subsystem. Call before recording any observations.
func InitAveragers(db *sql.DB, tableName string) {
	uncorrectableErrorsAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_due_to_uncorrectable_errors")
	remappingPendingAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_remapping_pending")
	remappingFailedAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_remapping_failed")
}

// ReadRemappedDueToUncorrectableErrors returns the stored observations of
// rows remapped due to uncorrectable errors, recorded at or after "since".
func ReadRemappedDueToUncorrectableErrors(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return uncorrectableErrorsAverager.Read(ctx, components_metrics.WithSince(since))
}

// ReadRemappingPending returns the stored remapping-pending observations
// recorded at or after "since".
func ReadRemappingPending(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return remappingPendingAverager.Read(ctx, components_metrics.WithSince(since))
}

// ReadRemappingFailed returns the stored remapping-failed observations
// recorded at or after "since".
func ReadRemappingFailed(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return remappingFailedAverager.Read(ctx, components_metrics.WithSince(since))
}

// SetLastUpdateUnixSeconds records the time (in unix seconds) at which the
// remapped-rows metrics were last refreshed.
func SetLastUpdateUnixSeconds(unixSeconds float64) {
	lastUpdateUnixSeconds.Set(unixSeconds)
}

// SetRemappedDueToUncorrectableErrors exports the per-GPU count of rows
// remapped due to uncorrectable errors to Prometheus, and records the same
// observation in the historical averager.
func SetRemappedDueToUncorrectableErrors(ctx context.Context, gpuID string, cnt uint32, currentTime time.Time) error {
	count := float64(cnt)
	uncorrectableErrors.WithLabelValues(gpuID).Set(count)

	return uncorrectableErrorsAverager.Observe(
		ctx,
		count,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// SetRemappingPending exports whether the GPU still needs a reset for row
// remapping to take effect (1 if pending, 0 otherwise), and records the same
// observation in the historical averager.
func SetRemappingPending(ctx context.Context, gpuID string, pending bool, currentTime time.Time) error {
	var v float64
	if pending {
		v = 1
	}
	remappingPending.WithLabelValues(gpuID).Set(v)

	return remappingPendingAverager.Observe(
		ctx,
		v,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// SetRemappingFailed exports whether a row remapping has failed in the past
// on this GPU (1 if failed, 0 otherwise), and records the same observation
// in the historical averager.
func SetRemappingFailed(ctx context.Context, gpuID string, failed bool, currentTime time.Time) error {
	var v float64
	if failed {
		v = 1
	}
	remappingFailed.WithLabelValues(gpuID).Set(v)

	return remappingFailedAverager.Observe(
		ctx,
		v,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// Register initializes the DB-backed averagers and registers every
// remapped-rows collector with the given Prometheus registry, returning the
// first registration error encountered.
func Register(reg *prometheus.Registry, db *sql.DB, tableName string) error {
	InitAveragers(db, tableName)

	collectors := []prometheus.Collector{
		lastUpdateUnixSeconds,
		uncorrectableErrors,
		remappingPending,
		remappingFailed,
	}
	for _, c := range collectors {
		if err := reg.Register(c); err != nil {
			return err
		}
	}
	return nil
}
Loading
Loading