diff --git a/Makefile b/Makefile index 4679c325be..9a458da975 100644 --- a/Makefile +++ b/Makefile @@ -331,8 +331,8 @@ cos-gpu-all-tests: gpu-images cos-gpu-smoke-tests $(RUNTIME_BIN) @$(call sudo,test/gpu:sniffer_test,--runtime=$(RUNTIME) -test.v --cos-gpu $(ARGS)) .PHONY: cos-gpu-all-tests -cuda-tests: load-gpu_cuda-tests $(RUNTIME_BIN) - @$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true) +cuda-tests: load-basic_alpine load-gpu_cuda-tests $(RUNTIME_BIN) + @$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all) @$(call sudo,test/gpu:cuda_test,--runtime=$(RUNTIME) -test.v $(ARGS)) .PHONY: cuda-tests diff --git a/images/gpu/cuda-tests/list_features.cu b/images/gpu/cuda-tests/list_features.cu index 060740304c..6f95cf7f1a 100644 --- a/images/gpu/cuda-tests/list_features.cu +++ b/images/gpu/cuda-tests/list_features.cu @@ -49,4 +49,18 @@ int main(int argc, char *argv[]) { CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED, cuda_device)); printFeature("COMPRESSIBLE_MEMORY", isCompressionAvailable != 0); + bool p2pAvailable = false; + int gpuCount = -1; + CHECK_CUDA(cudaGetDeviceCount(&gpuCount)); + printf("// Number of GPUs: %d\n", gpuCount); + if (gpuCount >= 2) { + int canAccessAToB = -1; + CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1)); + printf("// CUDA P2P: 0 -> 1: %d\n", canAccessAToB); + int canAccessBToA = -1; + CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessBToA, 1, 0)); + printf("// CUDA P2P: 1 -> 0: %d\n", canAccessBToA); + p2pAvailable = canAccessAToB > 0 && canAccessBToA > 0; + } + printFeature("P2P", p2pAvailable); } diff --git a/images/gpu/cuda-tests/list_features.sh b/images/gpu/cuda-tests/list_features.sh index 32ec98a103..400f25131b 100644 --- a/images/gpu/cuda-tests/list_features.sh +++ b/images/gpu/cuda-tests/list_features.sh @@ -20,6 +20,13 @@ set -euo pipefail +if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" != "all" ]]; then + echo "NVIDIA_DRIVER_CAPABILITIES is not set to 'all'." >&2 + echo "It is set to: '${NVIDIA_DRIVER_CAPABILITIES:-}'" >&2 + echo "Please set it to 'all' and try again." >&2 + exit 1 +fi + cd / nvcc list_features.cu -lcuda -o list_features ./list_features diff --git a/images/gpu/cuda-tests/run_sample.go b/images/gpu/cuda-tests/run_sample.go index 1d5e9ca1df..d4ce0bd8ad 100644 --- a/images/gpu/cuda-tests/run_sample.go +++ b/images/gpu/cuda-tests/run_sample.go @@ -997,6 +997,9 @@ func (st *SampleTest) RunLibNVVMTest(ctx context.Context) error { // Main is the main method of this program. func Main(ctx context.Context) (int, error) { flag.Parse() + if nvCaps := os.Getenv("NVIDIA_DRIVER_CAPABILITIES"); nvCaps != "all" { + return 1, fmt.Errorf("NVIDIA_DRIVER_CAPABILITIES is not set to 'all' (got %q); please set it to 'all' and try again", nvCaps) + } cleanupCtx, cleanupCancel := context.WithTimeout(ctx, *timeoutFlag) defer cleanupCancel() deadline, _ := cleanupCtx.Deadline() @@ -1007,15 +1010,15 @@ func Main(ctx context.Context) (int, error) { defer x.Shutdown(cleanupCtx) testsCtx, testsCancel := context.WithDeadline(cleanupCtx, deadline.Add(-10*time.Second)) defer testsCancel() - failed := false numTests := 0 exitCode := 1 + var lastErr error for _, testName := range flag.Args() { numTests++ st, err := NewSampleTest(testName, x) if err != nil { log("> Invalid test %q: %s", testName, err) - failed = true + lastErr = fmt.Errorf("invalid test %q: %w", testName, err) continue } log("> Running test: %s", testName) @@ -1024,7 +1027,7 @@ func Main(ctx context.Context) (int, error) { testCancel() if err != nil { log("> Test failed: %s (%s)", testName, err) - failed = true + lastErr = fmt.Errorf("test %q failed: %w", testName, err) if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) && exitErr.ExitCode() > 0 { exitCode = exitErr.ExitCode() } @@ -1035,14 +1038,19 @@ func Main(ctx context.Context) (int, error) { if numTests == 0 { return 1, fmt.Errorf("no tests to run, failing vacuously; specify test names as positional arguments") } - if failed { - if numTests == 1 { - // If there was a single test to run, pass along its error code. - return exitCode, fmt.Errorf("test failed") - } - return 1, errors.New("one or more tests failed") + if lastErr == nil { + return 0, nil + } + if numTests != 1 { + return 1, fmt.Errorf("one or more tests failed (last error: %w)", lastErr) + } + // If there was a single test to run, pass along its error code if it + // had one. (It may not have had one in case the test failed for another + // reason, e.g. error setting up the test prior to running it.) + if exitCode == 0 { + exitCode = 1 } - return 0, nil + return exitCode, fmt.Errorf("test failed: %w", lastErr) } func main() { diff --git a/pkg/test/dockerutil/container.go b/pkg/test/dockerutil/container.go index f6975150a3..d763f45d45 100644 --- a/pkg/test/dockerutil/container.go +++ b/pkg/test/dockerutil/container.go @@ -1000,7 +1000,8 @@ func (cp *ContainerPool) String() string { sb.WriteString(", ") } status := cp.statuses[container] - sb.WriteString(container.Name) + sb.WriteString("#") + sb.WriteString(strconv.Itoa(i)) sb.WriteString("[") sb.WriteString(status.state.String()) sb.WriteString("]") diff --git a/test/gpu/cuda_test.go b/test/gpu/cuda_test.go index 41832ceabc..0e8709bced 100644 --- a/test/gpu/cuda_test.go +++ b/test/gpu/cuda_test.go @@ -23,6 +23,7 @@ import ( "math" "os" "runtime" + "slices" "strconv" "strings" "sync" @@ -73,9 +74,7 @@ var ( var testCompatibility = map[string]Compatibility{ "0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching), "0_Introduction/simpleCUDA2GL": RequiresFeatures(FeatureGL), - "0_Introduction/simpleIPC": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, - "0_Introduction/simpleP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), - "0_Introduction/vectorAddMMAP": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, + "0_Introduction/simpleP2P": &RequiresP2P{}, "2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{ Reason: "Requires ancient version of glibc (<=2.33)", }, @@ -90,12 +89,12 @@ var testCompatibility = map[string]Compatibility{ ), "2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop": &OnlyOnWindows{}, "2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{}, - "2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), + "2_Concepts_and_Techniques/streamOrderedAllocationP2P": &RequiresP2P{}, "3_CUDA_Features/bf16TensorCoreGemm": RequiresFeatures(FeatureTensorCores), "3_CUDA_Features/cdpAdvancedQuicksort": RequiresFeatures(FeatureDynamicParallelism), "3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory), "3_CUDA_Features/dmmaTensorCoreGemm": RequiresFeatures(FeatureTensorCores), - "3_CUDA_Features/memMapIPCDrv": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), + "3_CUDA_Features/memMapIPCDrv": &RequiresMultiGPU{}, "3_CUDA_Features/tf32TensorCoreGemm": RequiresFeatures(FeatureTensorCores), "4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), "4_CUDA_Libraries/cudaNvSci": &RequiresNvSci{}, @@ -105,14 +104,14 @@ var testCompatibility = map[string]Compatibility{ "4_CUDA_Libraries/cuDLAStandaloneMode": &OnlyOnWindows{}, "4_CUDA_Libraries/cuDLALayerwiseStatsHybrid": &OnlyOnWindows{}, "4_CUDA_Libraries/cuDLALayerwiseStatsStandalone": &OnlyOnWindows{}, - "4_CUDA_Libraries/simpleCUFFT_2d_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), - "4_CUDA_Libraries/simpleCUFFT_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}), + "4_CUDA_Libraries/simpleCUFFT_2d_MGPU": &RequiresMultiGPU{}, + "4_CUDA_Libraries/simpleCUFFT_MGPU": &RequiresMultiGPU{}, "5_Domain_Specific/fluidsD3D9": &OnlyOnWindows{}, "5_Domain_Specific/fluidsGL": RequiresFeatures(FeatureGL), "5_Domain_Specific/fluidsGLES": &OnlyOnWindows{}, "5_Domain_Specific/nbody_opengles": &OnlyOnWindows{}, "5_Domain_Specific/nbody_screen": &OnlyOnWindows{}, - "5_Domain_Specific/p2pBandwidthLatencyTest": &BrokenInGVisor{OnlyWhenMultipleGPU: true}, + "5_Domain_Specific/p2pBandwidthLatencyTest": &RequiresP2P{}, "5_Domain_Specific/postProcessGL": RequiresFeatures(FeatureGL), "5_Domain_Specific/simpleD3D10": &OnlyOnWindows{}, "5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{}, @@ -133,8 +132,11 @@ var testCompatibility = map[string]Compatibility{ } // flakyTests is a list of tests that are flaky. -// These will be retried up to 3 times in parallel before running serially. -var flakyTests = map[string]struct{}{} +// These will be retried up to 3 times in parallel before running 3 times +// serially. +var flakyTests = map[string]struct{}{ + "3_CUDA_Features/cdpAdvancedQuicksort": {}, +} // exclusiveTests is a list of tests that must run exclusively (i.e. with // no other test running on the machine at the same time), or they will @@ -145,6 +147,13 @@ var flakyTests = map[string]struct{}{} // causing spurious failures for the tests that happen to be running in // parallel with them. var exclusiveTests = map[string]struct{}{ + // Can fail due to + // "launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount" + // when running in parallel with other tests. + "3_CUDA_Features/cdpAdvancedQuicksort": {}, + + // Performance-intensive tests that tend to make other concurrent tests + // flake due to their high resource usage. "6_Performance/alignedTypes": {}, "6_Performance/transpose": {}, "6_Performance/UnifiedMemoryPerf": {}, @@ -153,12 +162,7 @@ var exclusiveTests = map[string]struct{}{ // alwaysSkippedTests don't run at all, ever, and are not verified when // --cuda_verify_compatibility is set. // Each test is mapped to a reason why it should be skipped. -var alwaysSkippedTests = map[string]string{ - // These tests seem to flake in gVisor, but consistently within the same - // run of the overall test, so they cannot be included in `flakyTests`. - "0_Introduction/simpleAssert": "Flaky in gVisor", - "0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor", -} +var alwaysSkippedTests = map[string]string{} // Feature is a feature as listed by /list_features.sh. type Feature string @@ -170,6 +174,7 @@ const ( FeatureGL Feature = "GL" FeatureTensorCores Feature = "TENSOR_CORES" FeatureCompressibleMemory Feature = "COMPRESSIBLE_MEMORY" + FeatureP2P Feature = "P2P" ) // allFeatures is a list of all CUDA features above. @@ -179,6 +184,7 @@ var allFeatures = []Feature{ FeatureGL, FeatureTensorCores, FeatureCompressibleMemory, + FeatureP2P, } // TestEnvironment represents the environment in which a sample test runs. @@ -228,10 +234,6 @@ type BrokenInGVisor struct { // This is for tests that can run on a single or multiple GPUs alike, // but specifically fail in gVisor when run with multiple GPUs. OnlyWhenMultipleGPU bool - - // KnownToHang may be set to true for short tests which can hang instead - // of failing. This avoids waiting ~forever for them to finish. - KnownToHang bool } // WillFail implements `Compatibility.WillFail`. @@ -273,6 +275,34 @@ func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnviron return nil } +// RequiresMultiGPU implements `Compatibility` for tests that require +// peer-to-peer communication between GPUs. +// Implies RequiresMultiGPU, so tests do not need to specify both. +type RequiresP2P struct{} + +// WillFail implements `Compatibility.WillFail`. +func (*RequiresP2P) WillFail(ctx context.Context, env *TestEnvironment) string { + if notEnoughGPUs := (&RequiresMultiGPU{}).WillFail(ctx, env); notEnoughGPUs != "" { + return notEnoughGPUs + } + if hasP2P := env.Features[FeatureP2P]; !hasP2P { + return "Requires P2P support" + } + return "" +} + +// IsExpectedFailure implements `Compatibility.IsExpectedFailure`. +func (*RequiresP2P) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { + if err := (&RequiresMultiGPU{}).IsExpectedFailure(ctx, env, logs, exitCode); err == nil { + return nil + } + const wantLog = "Peer to Peer access is not available amongst GPUs in the system, waiving test" + if strings.Contains(logs, wantLog) { + return nil + } + return fmt.Errorf("exit code %d and logs %q, expected EXIT_WAIVED (%d) or log message %q", exitCode, logs, exitCodeWaived, wantLog) +} + // requiresFeatures implements `Compatibility` for tests that require // specific features. type requiresFeatures struct { @@ -294,7 +324,13 @@ func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) s } // IsExpectedFailure implements `Compatibility.IsExpectedFailure`. -func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { +func (r *requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error { + if slices.Contains(r.features, FeatureGL) && !env.Features[FeatureGL] && strings.Contains(logs, `code=999(cudaErrorUnknown) "cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)"`) { + // Some GL-requiring tests such as `5_Domain_Specific/postProcessGL` + // and `5_Domain_Specific/fluidsGL` will incorrectly detect that GL + // is supported, and fail with this error message rather than waiving. + return nil + } if exitCode != exitCodeWaived { return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived) } @@ -396,7 +432,9 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm // getContainerOpts returns the container run options to run CUDA tests. func getContainerOpts() (dockerutil.RunOpts, error) { - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ + Capabilities: dockerutil.AllGPUCapabilities, + }) if err != nil { return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err) } @@ -444,8 +482,27 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) } if runtimeIsGVisor { testLog(t, "Runtime is detected as gVisor") + runtimeArgs, err := dockerutil.RuntimeArgs() + if err != nil { + t.Fatalf("Failed to get runtime arguments: %v", err) + } + foundNVCaps := "" + const nvCapsPrefixFlag = "--nvproxy-allowed-driver-capabilities" + for i, arg := range runtimeArgs { + if strings.HasPrefix(arg, nvCapsPrefixFlag+"=") { + foundNVCaps = strings.TrimPrefix(arg, nvCapsPrefixFlag+"=") + } else if arg == "--nvproxy-allowed-driver-capabilities" && i < len(runtimeArgs)-1 { + foundNVCaps = runtimeArgs[i+1] + } + } + if foundNVCaps == "" { + return nil, fmt.Errorf("did not find --nvproxy-allowed-driver-capabilities=all flag in gVisor runtime arguments, please specify it for this test") + } + if foundNVCaps != "all" { + return nil, fmt.Errorf("found --nvproxy-allowed-driver-capabilities=%q flag in gVisor runtime arguments, please specify --nvproxy-allowed-driver-capabilities=all for this test", foundNVCaps) + } } else { - testLog(t, "Runtime is detected as not gVisor") + testLog(t, "Runtime is detected as non-gVisor") } featuresContainer := dockerutil.MakeContainer(ctx, t) defer featuresContainer.CleanUp(ctx) @@ -463,6 +520,10 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error) if line == "" { continue } + if strings.HasPrefix(line, "//") { + testLog(t, "/list_features.sh: %s", line) + continue + } featureAvailable := false var feature Feature if strings.HasPrefix(line, "PRESENT: ") { @@ -719,8 +780,13 @@ func TestCUDA(t *testing.T) { // for some reason (e.g. out of GPU memory). // To address this, the test first runs every test in parallel. Then, if // any of them failed, it will run only the failed ones serially. - numContainers := getDesiredTestParallelism() - testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers) + numParallel := getDesiredTestParallelism() + numContainers := min(numParallel, max(numTests, 1)) + if numContainers == numParallel { + testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers) + } else { + testLog(t, "%d tests to run, spawning %d CUDA containers...", numTests, numContainers) + } spawnGroup, spawnCtx := errgroup.WithContext(ctx) containers := make([]*dockerutil.Container, numContainers) for i := 0; i < numContainers; i++ { @@ -825,7 +891,7 @@ func TestCUDA(t *testing.T) { ) } } else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 { - testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0) + testLog(t, "WARNING: Container pool utilization was only %.1f%% during the test.", poolUtilization*100.0) testLog(t, "This test can be made faster and more efficient with proper test categorization,") testLog(t, "by identifying flaky tests and exclusive-requiring tests.") testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.") diff --git a/test/gpu/smoke_test.go b/test/gpu/smoke_test.go index d4137c39db..ffa7a66f9c 100644 --- a/test/gpu/smoke_test.go +++ b/test/gpu/smoke_test.go @@ -27,7 +27,9 @@ func TestGPUHello(t *testing.T) { c := dockerutil.MakeContainer(ctx, t) defer c.CleanUp(ctx) - opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{}) + opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{ + Capabilities: dockerutil.AllGPUCapabilities, + }) if err != nil { t.Fatalf("failed to get GPU run options: %v", err) }