Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(nvidia): track row remapping, RMA/GPU reset status #80

Merged
merged 10 commits into from
Sep 30, 2024
2 changes: 1 addition & 1 deletion components/accelerator/nvidia/info/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

o := &Output{
GPU: GPU{Attached: i.GPUCounts()},
GPU: GPU{Attached: i.GPUCount()},
Memory: Memory{
TotalBytes: totalMem,
TotalHumanized: totalMemHumanized,
Expand Down
20 changes: 12 additions & 8 deletions components/accelerator/nvidia/memory/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,16 +20,20 @@ func ToOutput(i *nvidia_query.Output) *Output {
}

o := &Output{}
for _, g := range i.SMI.GPUs {
if g.FBMemoryUsage == nil {
continue
}
parsed, err := g.FBMemoryUsage.Parse()
if err != nil {
continue

if i.SMI != nil {
for _, g := range i.SMI.GPUs {
if g.FBMemoryUsage == nil {
continue
}
parsed, err := g.FBMemoryUsage.Parse()
if err != nil {
continue
}
o.UsagesSMI = append(o.UsagesSMI, parsed)
}
o.UsagesSMI = append(o.UsagesSMI, parsed)
}

if i.NVML != nil {
for _, device := range i.NVML.DeviceInfos {
o.UsagesNVML = append(o.UsagesNVML, device.Memory)
Expand Down
6 changes: 3 additions & 3 deletions components/accelerator/nvidia/peermem/component_output.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ func ToOutput(i *nvidia_query.Output) *Output {

o := &Output{
LsmodPeermem: *i.LsmodPeermem,
GPUCounts: i.GPUCounts(),
GPUCount: i.GPUCount(),
ProductName: i.GPUProductName(),
}
return o
Expand All @@ -27,7 +27,7 @@ func ToOutput(i *nvidia_query.Output) *Output {
type Output struct {
// Represents the number of GPUs in the system.
// This is used to determine if ibcore may be expected to use peermem module.
GPUCounts int `json:"gpu_counts"`
GPUCount int `json:"gpu_count"`

ProductName string `json:"product_name"`
LsmodPeermem nvidia_query.LsmodPeermemModuleOutput `json:"lsmod_peermem"`
Expand Down Expand Up @@ -90,7 +90,7 @@ func (o *Output) States() ([]components.State, error) {
// so we don't decide whether peermem is required or not
Healthy: true,

Reason: fmt.Sprintf("ibcore is using peermem module? %v (gpu counts: %d)", o.LsmodPeermem.IbcoreUsingPeermemModule, o.GPUCounts),
Reason: fmt.Sprintf("ibcore is using peermem module? %v (gpu counts: %d)", o.LsmodPeermem.IbcoreUsingPeermemModule, o.GPUCount),
ExtraInfo: map[string]string{
StateKeyLsmodPeermemData: string(b),
StateKeyLsmodPeermemEncoding: StateValueLsmodPeermemEncodingJSON,
Expand Down
64 changes: 64 additions & 0 deletions components/accelerator/nvidia/query/gpu_memory.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
package query

import "strings"

// GetMemoryErrorManagementCapabilities returns the GPU memory error management
// capabilities inferred from the GPU product name.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
func GetMemoryErrorManagementCapabilities(gpuProductName string) MemoryErrorManagementCapabilities {
	name := strings.ToLower(gpuProductName)

	switch {
	// H100 and A100 support the full feature set.
	case strings.Contains(name, "h100"), strings.Contains(name, "a100"):
		return MemoryErrorManagementCapabilities{
			ErrorContainment:     true,
			DynamicPageOfflining: true,
			RowRemapping:         true,
		}

	// NOTE: must come after the "a100" case, since "a100" also contains "a10".
	case strings.Contains(name, "a10"):
		return MemoryErrorManagementCapabilities{
			RowRemapping: true,
		}

	// Unknown products advertise no memory error management capabilities.
	default:
		return MemoryErrorManagementCapabilities{}
	}
}

// MemoryErrorManagementCapabilities describes which GPU memory error
// management features a GPU supports.
// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#supported-gpus
type MemoryErrorManagementCapabilities struct {
	// (If supported) GPU can limit the impact of uncorrectable ECC errors to GPU applications.
	// Existing/new workloads will run unaffected, both in terms of accuracy and performance.
	// Thus, does not require a GPU reset when memory errors occur.
	//
	// Note that there are some rarer cases, where uncorrectable errors are still uncontained
	// thus impacting all other workloads being processed in the GPU.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#error-containments
	ErrorContainment bool `json:"error_containment"`

	// (If supported) GPU can dynamically mark the page containing uncorrectable errors
	// as unusable, and any existing or new workloads will not be allocating this page.
	//
	// Thus, does not require a GPU reset to recover from most uncorrectable ECC errors.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#dynamic-page-offlining
	DynamicPageOfflining bool `json:"dynamic_page_offlining"`

	// (If supported) GPU can replace degrading memory cells with spare ones
	// to avoid offlining regions of memory. And the row remapping is different
	// from dynamic page offlining which is fixed at a hardware level.
	//
	// The row remapping requires a GPU reset to take effect.
	//
	// ref. https://docs.nvidia.com/deploy/a100-gpu-mem-error-mgmt/index.html#row-remapping
	RowRemapping bool `json:"row_remapping"`
}
63 changes: 63 additions & 0 deletions components/accelerator/nvidia/query/gpu_memory_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
package query

import (
"reflect"
"testing"
)

// TestGetMemoryErrorManagementCapabilities verifies the product-name to
// capability mapping for representative GPUs, including case-insensitivity.
func TestGetMemoryErrorManagementCapabilities(t *testing.T) {
	tests := []struct {
		name           string
		gpuProductName string
		expected       MemoryErrorManagementCapabilities
	}{
		{
			name:           "NVIDIA H100 80GB HBM3",
			gpuProductName: "NVIDIA H100 80GB HBM3",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
		{
			name:           "NVIDIA GeForce RTX 4090",
			gpuProductName: "NVIDIA GeForce RTX 4090",
			expected:       MemoryErrorManagementCapabilities{},
		},
		{
			name:           "NVIDIA A10",
			gpuProductName: "NVIDIA A10",
			expected: MemoryErrorManagementCapabilities{
				RowRemapping: true,
			},
		},
		{
			name:           "NVIDIA A100",
			gpuProductName: "NVIDIA A100",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
		{
			name:           "Lowercase input",
			gpuProductName: "nvidia h100 80gb hbm3",
			expected: MemoryErrorManagementCapabilities{
				ErrorContainment:     true,
				DynamicPageOfflining: true,
				RowRemapping:         true,
			},
		},
	}

	for _, tt := range tests {
		t.Run(tt.name, func(t *testing.T) {
			result := GetMemoryErrorManagementCapabilities(tt.gpuProductName)
			if !reflect.DeepEqual(result, tt.expected) {
				// Fixed: the failure message previously referenced a
				// nonexistent function name ("GetGPUMemoryErrorManagement").
				t.Errorf("GetMemoryErrorManagementCapabilities(%q) = %v, want %v", tt.gpuProductName, result, tt.expected)
			}
		})
	}
}
152 changes: 152 additions & 0 deletions components/accelerator/nvidia/query/metrics/remapped-rows/metrics.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
// Package remappedrows provides the NVIDIA row remapping metrics collection and reporting.
package remappedrows

import (
"context"
"database/sql"
"time"

components_metrics "github.com/leptonai/gpud/components/metrics"
components_metrics_state "github.com/leptonai/gpud/components/metrics/state"

"github.com/prometheus/client_golang/prometheus"
)

// SubSystem is the Prometheus subsystem label shared by all metrics below,
// and the name prefix used for the DB-backed averager metric names.
const SubSystem = "accelerator_nvidia_remapped_rows"

var (
	// lastUpdateUnixSeconds tracks when the remapped-rows metrics were last refreshed.
	lastUpdateUnixSeconds = prometheus.NewGauge(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "last_update_unix_seconds",
			Help:      "tracks the last update time in unix seconds",
		},
	)

	// uncorrectableErrors reports, per GPU, the number of rows remapped
	// due to uncorrectable errors.
	uncorrectableErrors = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "due_to_uncorrectable_errors",
			Help:      "tracks the number of rows remapped due to uncorrectable errors",
		},
		[]string{"gpu_id"},
	)
	// uncorrectableErrorsAverager persists observations; starts as a no-op
	// until InitAveragers swaps in a DB-backed implementation.
	uncorrectableErrorsAverager = components_metrics.NewNoOpAverager()

	// remappingPending is set to 1 per GPU when a reset is still required
	// for the row remapping to take effect.
	remappingPending = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "remapping_pending",
			Help:      "set to 1 if this GPU requires a reset to actually remap the row",
		},
		[]string{"gpu_id"},
	)
	// remappingPendingAverager persists observations; no-op until InitAveragers runs.
	remappingPendingAverager = components_metrics.NewNoOpAverager()

	// remappingFailed is set to 1 per GPU when a row remapping has failed in the past.
	remappingFailed = prometheus.NewGaugeVec(
		prometheus.GaugeOpts{
			Namespace: "",
			Subsystem: SubSystem,
			Name:      "remapping_failed",
			Help:      "set to 1 if a remapping has failed in the past",
		},
		[]string{"gpu_id"},
	)
	// remappingFailedAverager persists observations; no-op until InitAveragers runs.
	remappingFailedAverager = components_metrics.NewNoOpAverager()
)

// InitAveragers replaces the package-level no-op averagers with DB-backed
// ones that persist observations into the given table, one per metric
// tracked by this subsystem. Call before recording any observations.
func InitAveragers(db *sql.DB, tableName string) {
	uncorrectableErrorsAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_due_to_uncorrectable_errors")
	remappingPendingAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_remapping_pending")
	remappingFailedAverager = components_metrics.NewAverager(db, tableName, SubSystem+"_remapping_failed")
}

// ReadRemappedDueToUncorrectableErrors returns the stored observations of
// rows remapped due to uncorrectable errors, recorded at or after "since".
func ReadRemappedDueToUncorrectableErrors(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return uncorrectableErrorsAverager.Read(ctx, components_metrics.WithSince(since))
}

// ReadRemappingPending returns the stored remapping-pending observations
// recorded at or after "since".
func ReadRemappingPending(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return remappingPendingAverager.Read(ctx, components_metrics.WithSince(since))
}

// ReadRemappingFailed returns the stored remapping-failed observations
// recorded at or after "since".
func ReadRemappingFailed(ctx context.Context, since time.Time) (components_metrics_state.Metrics, error) {
	return remappingFailedAverager.Read(ctx, components_metrics.WithSince(since))
}

// SetLastUpdateUnixSeconds records the time (in unix seconds) at which the
// remapped-rows metrics were last refreshed.
func SetLastUpdateUnixSeconds(unixSeconds float64) {
	lastUpdateUnixSeconds.Set(unixSeconds)
}

// SetRemappedDueToUncorrectableErrors exports the per-GPU count of rows
// remapped due to uncorrectable errors to Prometheus, and records the same
// observation in the historical averager.
func SetRemappedDueToUncorrectableErrors(ctx context.Context, gpuID string, cnt uint32, currentTime time.Time) error {
	count := float64(cnt)
	uncorrectableErrors.WithLabelValues(gpuID).Set(count)

	return uncorrectableErrorsAverager.Observe(
		ctx,
		count,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// SetRemappingPending exports whether the GPU still needs a reset for row
// remapping to take effect (1 if pending, 0 otherwise), and records the same
// observation in the historical averager.
func SetRemappingPending(ctx context.Context, gpuID string, pending bool, currentTime time.Time) error {
	var v float64
	if pending {
		v = 1
	}
	remappingPending.WithLabelValues(gpuID).Set(v)

	return remappingPendingAverager.Observe(
		ctx,
		v,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// SetRemappingFailed exports whether a row remapping has failed in the past
// on this GPU (1 if failed, 0 otherwise), and records the same observation
// in the historical averager.
func SetRemappingFailed(ctx context.Context, gpuID string, failed bool, currentTime time.Time) error {
	var v float64
	if failed {
		v = 1
	}
	remappingFailed.WithLabelValues(gpuID).Set(v)

	return remappingFailedAverager.Observe(
		ctx,
		v,
		components_metrics.WithCurrentTime(currentTime),
		components_metrics.WithMetricSecondaryName(gpuID),
	)
}

// Register initializes the DB-backed averagers and registers every
// remapped-rows collector with the given Prometheus registry, returning the
// first registration error encountered.
func Register(reg *prometheus.Registry, db *sql.DB, tableName string) error {
	InitAveragers(db, tableName)

	collectors := []prometheus.Collector{
		lastUpdateUnixSeconds,
		uncorrectableErrors,
		remappingPending,
		remappingFailed,
	}
	for _, c := range collectors {
		if err := reg.Register(c); err != nil {
			return err
		}
	}
	return nil
}
Loading
Loading