Skip to content

Commit

Permalink
Update CUDA test compatibility to keep up with added gVisor support.
Browse files Browse the repository at this point in the history
These CUDA tests were initially broken in gVisor but now appear to pass.

The test now also verifies that all capabilities are enabled when running.

PiperOrigin-RevId: 711880073
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Jan 8, 2025
1 parent b94ab73 commit 6415633
Show file tree
Hide file tree
Showing 7 changed files with 138 additions and 40 deletions.
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,8 @@ cos-gpu-all-tests: gpu-images cos-gpu-smoke-tests $(RUNTIME_BIN)
@$(call sudo,test/gpu:sniffer_test,--runtime=$(RUNTIME) -test.v --cos-gpu $(ARGS))
.PHONY: cos-gpu-all-tests

cuda-tests: load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true)
cuda-tests: load-basic_alpine load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:cuda_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: cuda-tests

Expand Down
14 changes: 14 additions & 0 deletions images/gpu/cuda-tests/list_features.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,18 @@ int main(int argc, char *argv[]) {
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED,
cuda_device));
printFeature("COMPRESSIBLE_MEMORY", isCompressionAvailable != 0);
bool p2pAvailable = false;
int gpuCount = -1;
CHECK_CUDA(cudaGetDeviceCount(&gpuCount));
printf("// Number of GPUs: %d\n", gpuCount);
if (gpuCount >= 2) {
int canAccessAToB = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1));
printf("// CUDA P2P: 0 -> 1: %d\n", canAccessAToB);
int canAccessBToA = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessBToA, 1, 0));
printf("// CUDA P2P: 1 -> 0: %d\n", canAccessBToA);
p2pAvailable = canAccessAToB > 0 && canAccessBToA > 0;
}
printFeature("P2P", p2pAvailable);
}
7 changes: 7 additions & 0 deletions images/gpu/cuda-tests/list_features.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@

set -euo pipefail

if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" != "all" ]]; then
echo "NVIDIA_DRIVER_CAPABILITIES is not set to 'all'." >&2
echo "It is set to: '${NVIDIA_DRIVER_CAPABILITIES:-}'" >&2
echo "Please set it to 'all' and try again." >&2
exit 1
fi

cd /
nvcc list_features.cu -lcuda -o list_features
./list_features
Expand Down
28 changes: 18 additions & 10 deletions images/gpu/cuda-tests/run_sample.go
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,9 @@ func (st *SampleTest) RunLibNVVMTest(ctx context.Context) error {
// Main is the main method of this program.
func Main(ctx context.Context) (int, error) {
flag.Parse()
if nvCaps := os.Getenv("NVIDIA_DRIVER_CAPABILITIES"); nvCaps != "all" {
return 1, fmt.Errorf("NVIDIA_DRIVER_CAPABILITIES is not set to 'all' (got %q); please set it to 'all' and try again", nvCaps)
}
cleanupCtx, cleanupCancel := context.WithTimeout(ctx, *timeoutFlag)
defer cleanupCancel()
deadline, _ := cleanupCtx.Deadline()
Expand All @@ -1007,15 +1010,15 @@ func Main(ctx context.Context) (int, error) {
defer x.Shutdown(cleanupCtx)
testsCtx, testsCancel := context.WithDeadline(cleanupCtx, deadline.Add(-10*time.Second))
defer testsCancel()
failed := false
numTests := 0
exitCode := 1
var lastErr error
for _, testName := range flag.Args() {
numTests++
st, err := NewSampleTest(testName, x)
if err != nil {
log("> Invalid test %q: %s", testName, err)
failed = true
lastErr = fmt.Errorf("invalid test %q: %w", testName, err)
continue
}
log("> Running test: %s", testName)
Expand All @@ -1024,7 +1027,7 @@ func Main(ctx context.Context) (int, error) {
testCancel()
if err != nil {
log("> Test failed: %s (%s)", testName, err)
failed = true
lastErr = fmt.Errorf("test %q failed: %w", testName, err)
if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) && exitErr.ExitCode() > 0 {
exitCode = exitErr.ExitCode()
}
Expand All @@ -1035,14 +1038,19 @@ func Main(ctx context.Context) (int, error) {
if numTests == 0 {
return 1, fmt.Errorf("no tests to run, failing vacuously; specify test names as positional arguments")
}
if failed {
if numTests == 1 {
// If there was a single test to run, pass along its error code.
return exitCode, fmt.Errorf("test failed")
}
return 1, errors.New("one or more tests failed")
if lastErr == nil {
return 0, nil
}
if numTests != 1 {
return 1, fmt.Errorf("one or more tests failed (last error: %w)", lastErr)
}
// If there was a single test to run, pass along its error code if it
// had one. (It may not have had one in case the test failed for another
// reason, e.g. error setting up the test prior to running it.)
if exitCode == 0 {
exitCode = 1
}
return 0, nil
return exitCode, fmt.Errorf("test failed: %w", lastErr)
}

func main() {
Expand Down
3 changes: 2 additions & 1 deletion pkg/test/dockerutil/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,8 @@ func (cp *ContainerPool) String() string {
sb.WriteString(", ")
}
status := cp.statuses[container]
sb.WriteString(container.Name)
sb.WriteString("#")
sb.WriteString(strconv.Itoa(i))
sb.WriteString("[")
sb.WriteString(status.state.String())
sb.WriteString("]")
Expand Down
118 changes: 92 additions & 26 deletions test/gpu/cuda_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"math"
"os"
"runtime"
"slices"
"strconv"
"strings"
"sync"
Expand Down Expand Up @@ -73,9 +74,7 @@ var (
var testCompatibility = map[string]Compatibility{
"0_Introduction/simpleAttributes": RequiresFeatures(FeaturePersistentL2Caching),
"0_Introduction/simpleCUDA2GL": RequiresFeatures(FeatureGL),
"0_Introduction/simpleIPC": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"0_Introduction/vectorAddMMAP": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"0_Introduction/simpleP2P": &RequiresP2P{},
"2_Concepts_and_Techniques/cuHook": &BrokenEverywhere{
Reason: "Requires ancient version of glibc (<=2.33)",
},
Expand All @@ -90,12 +89,12 @@ var testCompatibility = map[string]Compatibility{
),
"2_Concepts_and_Techniques/EGLSync_CUDAEvent_Interop": &OnlyOnWindows{},
"2_Concepts_and_Techniques/streamOrderedAllocationIPC": &BrokenInGVisor{},
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"2_Concepts_and_Techniques/streamOrderedAllocationP2P": &RequiresP2P{},
"3_CUDA_Features/bf16TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/cdpAdvancedQuicksort": RequiresFeatures(FeatureDynamicParallelism),
"3_CUDA_Features/cudaCompressibleMemory": RequiresFeatures(FeatureCompressibleMemory),
"3_CUDA_Features/dmmaTensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"3_CUDA_Features/memMapIPCDrv": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"3_CUDA_Features/memMapIPCDrv": &RequiresMultiGPU{},
"3_CUDA_Features/tf32TensorCoreGemm": RequiresFeatures(FeatureTensorCores),
"4_CUDA_Libraries/conjugateGradientMultiDeviceCG": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/cudaNvSci": &RequiresNvSci{},
Expand All @@ -105,14 +104,14 @@ var testCompatibility = map[string]Compatibility{
"4_CUDA_Libraries/cuDLAStandaloneMode": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsHybrid": &OnlyOnWindows{},
"4_CUDA_Libraries/cuDLALayerwiseStatsStandalone": &OnlyOnWindows{},
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_MGPU": MultiCompatibility(&RequiresMultiGPU{}, &BrokenInGVisor{}),
"4_CUDA_Libraries/simpleCUFFT_2d_MGPU": &RequiresMultiGPU{},
"4_CUDA_Libraries/simpleCUFFT_MGPU": &RequiresMultiGPU{},
"5_Domain_Specific/fluidsD3D9": &OnlyOnWindows{},
"5_Domain_Specific/fluidsGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/fluidsGLES": &OnlyOnWindows{},
"5_Domain_Specific/nbody_opengles": &OnlyOnWindows{},
"5_Domain_Specific/nbody_screen": &OnlyOnWindows{},
"5_Domain_Specific/p2pBandwidthLatencyTest": &BrokenInGVisor{OnlyWhenMultipleGPU: true},
"5_Domain_Specific/p2pBandwidthLatencyTest": &RequiresP2P{},
"5_Domain_Specific/postProcessGL": RequiresFeatures(FeatureGL),
"5_Domain_Specific/simpleD3D10": &OnlyOnWindows{},
"5_Domain_Specific/simpleD3D10RenderTarget": &OnlyOnWindows{},
Expand All @@ -133,8 +132,11 @@ var testCompatibility = map[string]Compatibility{
}

// flakyTests is a list of tests that are flaky.
// These will be retried up to 3 times in parallel before running serially.
var flakyTests = map[string]struct{}{}
// These will be retried up to 3 times in parallel before running 3 times
// serially.
var flakyTests = map[string]struct{}{
"3_CUDA_Features/cdpAdvancedQuicksort": {},
}

// exclusiveTests is a list of tests that must run exclusively (i.e. with
// no other test running on the machine at the same time), or they will
Expand All @@ -145,6 +147,13 @@ var flakyTests = map[string]struct{}{}
// causing spurious failures for the tests that happen to be running in
// parallel with them.
var exclusiveTests = map[string]struct{}{
// Can fail due to
// "launch failed because launch would exceed cudaLimitDevRuntimePendingLaunchCount"
// when running in parallel with other tests.
"3_CUDA_Features/cdpAdvancedQuicksort": {},

// Performance-intensive tests that tend to make other concurrent tests
// flake due to their high resource usage.
"6_Performance/alignedTypes": {},
"6_Performance/transpose": {},
"6_Performance/UnifiedMemoryPerf": {},
Expand All @@ -153,12 +162,7 @@ var exclusiveTests = map[string]struct{}{
// alwaysSkippedTests don't run at all, ever, and are not verified when
// --cuda_verify_compatibility is set.
// Each test is mapped to a reason why it should be skipped.
var alwaysSkippedTests = map[string]string{
// These tests seem to flake in gVisor, but consistently within the same
// run of the overall test, so they cannot be included in `flakyTests`.
"0_Introduction/simpleAssert": "Flaky in gVisor",
"0_Introduction/simpleAssert_nvrtc": "Flaky in gVisor",
}
var alwaysSkippedTests = map[string]string{}

// Feature is a feature as listed by /list_features.sh.
type Feature string
Expand All @@ -170,6 +174,7 @@ const (
FeatureGL Feature = "GL"
FeatureTensorCores Feature = "TENSOR_CORES"
FeatureCompressibleMemory Feature = "COMPRESSIBLE_MEMORY"
FeatureP2P Feature = "P2P"
)

// allFeatures is a list of all CUDA features above.
Expand All @@ -179,6 +184,7 @@ var allFeatures = []Feature{
FeatureGL,
FeatureTensorCores,
FeatureCompressibleMemory,
FeatureP2P,
}

// TestEnvironment represents the environment in which a sample test runs.
Expand Down Expand Up @@ -228,10 +234,6 @@ type BrokenInGVisor struct {
// This is for tests that can run on a single or multiple GPUs alike,
// but specifically fail in gVisor when run with multiple GPUs.
OnlyWhenMultipleGPU bool

// KnownToHang may be set to true for short tests which can hang instead
// of failing. This avoids waiting ~forever for them to finish.
KnownToHang bool
}

// WillFail implements `Compatibility.WillFail`.
Expand Down Expand Up @@ -273,6 +275,34 @@ func (*RequiresMultiGPU) IsExpectedFailure(ctx context.Context, env *TestEnviron
return nil
}

// RequiresP2P implements `Compatibility` for tests that require
// peer-to-peer communication between GPUs.
// Implies RequiresMultiGPU, so tests do not need to specify both.
type RequiresP2P struct{}

// WillFail implements `Compatibility.WillFail`.
// The test will fail unless at least two GPUs are present and the
// environment reports the P2P feature as available.
func (*RequiresP2P) WillFail(ctx context.Context, env *TestEnvironment) string {
	// Delegate the GPU-count check so the multi-GPU requirement is not duplicated here.
	if notEnoughGPUs := (&RequiresMultiGPU{}).WillFail(ctx, env); notEnoughGPUs != "" {
		return notEnoughGPUs
	}
	if hasP2P := env.Features[FeatureP2P]; !hasP2P {
		return "Requires P2P support"
	}
	return ""
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
// A failure is expected when it is already explained by the multi-GPU
// requirement, or when the test's logs show the CUDA-sample waiver message
// for missing peer-to-peer access.
func (*RequiresP2P) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
	if err := (&RequiresMultiGPU{}).IsExpectedFailure(ctx, env, logs, exitCode); err == nil {
		return nil
	}
	// Message emitted by CUDA sample binaries when they waive themselves
	// due to missing P2P support between the installed GPUs.
	const wantLog = "Peer to Peer access is not available amongst GPUs in the system, waiving test"
	if strings.Contains(logs, wantLog) {
		return nil
	}
	return fmt.Errorf("exit code %d and logs %q, expected EXIT_WAIVED (%d) or log message %q", exitCode, logs, exitCodeWaived, wantLog)
}

// requiresFeatures implements `Compatibility` for tests that require
// specific features.
type requiresFeatures struct {
Expand All @@ -294,7 +324,13 @@ func (r *requiresFeatures) WillFail(ctx context.Context, env *TestEnvironment) s
}

// IsExpectedFailure implements `Compatibility.IsExpectedFailure`.
func (*requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
func (r *requiresFeatures) IsExpectedFailure(ctx context.Context, env *TestEnvironment, logs string, exitCode int) error {
if slices.Contains(r.features, FeatureGL) && !env.Features[FeatureGL] && strings.Contains(logs, `code=999(cudaErrorUnknown) "cudaGraphicsGLRegisterBuffer(&cuda_vbo_resource, vbo, cudaGraphicsMapFlagsNone)"`) {
// Some GL-requiring tests such as `5_Domain_Specific/postProcessGL`
// and `5_Domain_Specific/fluidsGL` will incorrectly detect that GL
// is supported, and fail with this error message rather than waiving.
return nil
}
if exitCode != exitCodeWaived {
return fmt.Errorf("exit code %d, expected EXIT_WAIVED (%d)", exitCode, exitCodeWaived)
}
Expand Down Expand Up @@ -396,7 +432,9 @@ func (*FullyCompatible) IsExpectedFailure(ctx context.Context, env *TestEnvironm

// getContainerOpts returns the container run options to run CUDA tests.
func getContainerOpts() (dockerutil.RunOpts, error) {
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: dockerutil.AllGPUCapabilities,
})
if err != nil {
return dockerutil.RunOpts{}, fmt.Errorf("failed to get GPU run options: %w", err)
}
Expand Down Expand Up @@ -444,8 +482,27 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error)
}
if runtimeIsGVisor {
testLog(t, "Runtime is detected as gVisor")
runtimeArgs, err := dockerutil.RuntimeArgs()
if err != nil {
t.Fatalf("Failed to get runtime arguments: %v", err)
}
foundNVCaps := ""
const nvCapsPrefixFlag = "--nvproxy-allowed-driver-capabilities"
for i, arg := range runtimeArgs {
if strings.HasPrefix(arg, nvCapsPrefixFlag+"=") {
foundNVCaps = strings.TrimPrefix(arg, nvCapsPrefixFlag+"=")
} else if arg == "--nvproxy-allowed-driver-capabilities" && i < len(runtimeArgs)-1 {
foundNVCaps = runtimeArgs[i+1]
}
}
if foundNVCaps == "" {
return nil, fmt.Errorf("did not find --nvproxy-allowed-driver-capabilities=all flag in gVisor runtime arguments, please specify it for this test")
}
if foundNVCaps != "all" {
return nil, fmt.Errorf("found --nvproxy-allowed-driver-capabilities=%q flag in gVisor runtime arguments, please specify --nvproxy-allowed-driver-capabilities=all for this test", foundNVCaps)
}
} else {
testLog(t, "Runtime is detected as not gVisor")
testLog(t, "Runtime is detected as non-gVisor")
}
featuresContainer := dockerutil.MakeContainer(ctx, t)
defer featuresContainer.CleanUp(ctx)
Expand All @@ -463,6 +520,10 @@ func GetEnvironment(ctx context.Context, t *testing.T) (*TestEnvironment, error)
if line == "" {
continue
}
if strings.HasPrefix(line, "//") {
testLog(t, "/list_features.sh: %s", line)
continue
}
featureAvailable := false
var feature Feature
if strings.HasPrefix(line, "PRESENT: ") {
Expand Down Expand Up @@ -719,8 +780,13 @@ func TestCUDA(t *testing.T) {
// for some reason (e.g. out of GPU memory).
// To address this, the test first runs every test in parallel. Then, if
// any of them failed, it will run only the failed ones serially.
numContainers := getDesiredTestParallelism()
testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers)
numParallel := getDesiredTestParallelism()
numContainers := min(numParallel, max(numTests, 1))
if numContainers == numParallel {
testLog(t, "Number of cores is %d, spawning %.1f CUDA containers for each (%d containers total)...", runtime.NumCPU(), *containersPerCPU, numContainers)
} else {
testLog(t, "%d tests to run, spawning %d CUDA containers...", numTests, numContainers)
}
spawnGroup, spawnCtx := errgroup.WithContext(ctx)
containers := make([]*dockerutil.Container, numContainers)
for i := 0; i < numContainers; i++ {
Expand Down Expand Up @@ -825,7 +891,7 @@ func TestCUDA(t *testing.T) {
)
}
} else if poolUtilization := cp.Utilization(); poolUtilization < 0.6 {
testLog(t, "WARNING: Pool utilization was only %.1f%%.", poolUtilization*100.0)
testLog(t, "WARNING: Container pool utilization was only %.1f%% during the test.", poolUtilization*100.0)
testLog(t, "This test can be made faster and more efficient with proper test categorization,")
testLog(t, "by identifying flaky tests and exclusive-requiring tests.")
testLog(t, "Consider going over the logs to identify such tests and categorize them accordingly.")
Expand Down
4 changes: 3 additions & 1 deletion test/gpu/smoke_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ func TestGPUHello(t *testing.T) {
c := dockerutil.MakeContainer(ctx, t)
defer c.CleanUp(ctx)

opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{})
opts, err := dockerutil.GPURunOpts(dockerutil.SniffGPUOpts{
Capabilities: dockerutil.AllGPUCapabilities,
})
if err != nil {
t.Fatalf("failed to get GPU run options: %v", err)
}
Expand Down

0 comments on commit 6415633

Please sign in to comment.