Skip to content

Commit

Permalink
Update CUDA test compatibility to keep up with added gVisor support.
Browse files (browse the repository at this point in the history)
These CUDA tests were initially broken in gVisor but now appear to pass.

The test now also verifies that all NVIDIA driver capabilities are enabled (NVIDIA_DRIVER_CAPABILITIES=all) when it runs.

PiperOrigin-RevId: 711880073
  • Loading branch information
EtiennePerot authored and gvisor-bot committed Jan 7, 2025
1 parent b94ab73 commit d9b451f
Show file tree
Hide file tree
Showing 9 changed files with 154 additions and 51 deletions.
9 changes: 4 additions & 5 deletions .buildkite/pipeline.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -197,17 +197,16 @@ steps:
- make cos-gpu-all-tests
agents:
queue: cos-canary-gpu
- <<: *source_test_continuous
label: ":fish: CUDA tests"
- label: ":fish: CUDA tests (NOSUBMIT)"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 8
timeout_in_minutes: 60
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
Expand Down
25 changes: 19 additions & 6 deletions .buildkite/release.yaml
Original file line number Diff line number Diff line change
@@ -1,15 +1,16 @@
agents:
queue: release
_templates:
retry_settings: &retry_settings
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2
common: &common
timeout_in_minutes: 180
retry:
automatic:
- exit_status: -1
limit: 10
- exit_status: "*"
limit: 2

<<: *retry_settings
notify:
- email: "[email protected]"
if: build.state == "failed"
Expand Down Expand Up @@ -75,6 +76,18 @@ steps:
- make gpu-all-tests
agents:
queue: gpu
- label: ":fish: CUDA tests"
# This is its own test rather than being part of the GPU tests,
# because it takes around 30 minutes to run.
parallelism: 32
timeout_in_minutes: 120
retry:
<<: *retry_settings
commands:
- make sudo TARGETS=//tools/gpu:main ARGS="install --latest" || cat /var/log/nvidia-installer.log
- make cuda-tests ARGS="--cuda_verify_compatibility=true"
agents:
queue: gpu
- <<: *common
label: ":screwdriver: All GPU Drivers Test"
parallelism: 8
Expand Down
4 changes: 2 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -331,8 +331,8 @@ cos-gpu-all-tests: gpu-images cos-gpu-smoke-tests $(RUNTIME_BIN)
@$(call sudo,test/gpu:sniffer_test,--runtime=$(RUNTIME) -test.v --cos-gpu $(ARGS))
.PHONY: cos-gpu-all-tests

cuda-tests: load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true)
cuda-tests: load-basic_alpine load-gpu_cuda-tests $(RUNTIME_BIN)
@$(call install_runtime,$(RUNTIME),--nvproxy=true --nvproxy-docker=true --nvproxy-allowed-driver-capabilities=all)
@$(call sudo,test/gpu:cuda_test,--runtime=$(RUNTIME) -test.v $(ARGS))
.PHONY: cuda-tests

Expand Down
11 changes: 11 additions & 0 deletions images/gpu/cuda-tests/list_features.cu
Original file line number Diff line number Diff line change
Expand Up @@ -49,4 +49,15 @@ int main(int argc, char *argv[]) {
CU_DEVICE_ATTRIBUTE_GENERIC_COMPRESSION_SUPPORTED,
cuda_device));
printFeature("COMPRESSIBLE_MEMORY", isCompressionAvailable != 0);
bool p2pAvailable = false;
int gpuCount = -1;
CHECK_CUDA(cudaGetDeviceCount(&gpuCount));
if (gpuCount >= 2) {
int canAccessAToB = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessAToB, 0, 1));
int canAccessBToA = -1;
CHECK_CUDA(cudaDeviceCanAccessPeer(&canAccessBToA, 1, 0));
p2pAvailable = canAccessAToB > 0 && canAccessBToA > 0;
}
printFeature("P2P", p2pAvailable);
}
7 changes: 7 additions & 0 deletions images/gpu/cuda-tests/list_features.sh
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,13 @@

set -euo pipefail

# Refuse to continue unless NVIDIA_DRIVER_CAPABILITIES is exactly "all".
# The ":-" default makes the expansion safe under `set -u` when the variable
# is unset. Diagnostics (including the current value) go to stderr, and the
# script exits with status 1.
if [[ "${NVIDIA_DRIVER_CAPABILITIES:-}" != "all" ]]; then
echo "NVIDIA_DRIVER_CAPABILITIES is not set to 'all'." >&2
echo "It is set to: '${NVIDIA_DRIVER_CAPABILITIES:-}'" >&2
echo "Please set it to 'all' and try again." >&2
exit 1
fi

cd /
nvcc list_features.cu -lcuda -o list_features
./list_features
Expand Down
28 changes: 18 additions & 10 deletions images/gpu/cuda-tests/run_sample.go
Original file line number Diff line number Diff line change
Expand Up @@ -997,6 +997,9 @@ func (st *SampleTest) RunLibNVVMTest(ctx context.Context) error {
// Main is the main method of this program.
func Main(ctx context.Context) (int, error) {
flag.Parse()
if nvCaps := os.Getenv("NVIDIA_DRIVER_CAPABILITIES"); nvCaps != "all" {
return 1, fmt.Errorf("NVIDIA_DRIVER_CAPABILITIES is not set to 'all' (got %q); please set it to 'all' and try again", nvCaps)
}
cleanupCtx, cleanupCancel := context.WithTimeout(ctx, *timeoutFlag)
defer cleanupCancel()
deadline, _ := cleanupCtx.Deadline()
Expand All @@ -1007,15 +1010,15 @@ func Main(ctx context.Context) (int, error) {
defer x.Shutdown(cleanupCtx)
testsCtx, testsCancel := context.WithDeadline(cleanupCtx, deadline.Add(-10*time.Second))
defer testsCancel()
failed := false
numTests := 0
exitCode := 1
var lastErr error
for _, testName := range flag.Args() {
numTests++
st, err := NewSampleTest(testName, x)
if err != nil {
log("> Invalid test %q: %s", testName, err)
failed = true
lastErr = fmt.Errorf("invalid test %q: %w", testName, err)
continue
}
log("> Running test: %s", testName)
Expand All @@ -1024,7 +1027,7 @@ func Main(ctx context.Context) (int, error) {
testCancel()
if err != nil {
log("> Test failed: %s (%s)", testName, err)
failed = true
lastErr = fmt.Errorf("test %q failed: %w", testName, err)
if exitErr := (*exec.ExitError)(nil); errors.As(err, &exitErr) && exitErr.ExitCode() > 0 {
exitCode = exitErr.ExitCode()
}
Expand All @@ -1035,14 +1038,19 @@ func Main(ctx context.Context) (int, error) {
if numTests == 0 {
return 1, fmt.Errorf("no tests to run, failing vacuously; specify test names as positional arguments")
}
if failed {
if numTests == 1 {
// If there was a single test to run, pass along its error code.
return exitCode, fmt.Errorf("test failed")
}
return 1, errors.New("one or more tests failed")
if lastErr == nil {
return 0, nil
}
if numTests != 1 {
return 1, fmt.Errorf("one or more tests failed (last error: %w)", lastErr)
}
// If there was a single test to run, pass along its error code if it
// had one. (It may not have had one in case the test failed for another
// reason, e.g. error setting up the test prior to running it.)
if exitCode == 0 {
exitCode = 1
}
return 0, nil
return exitCode, fmt.Errorf("test failed: %w", lastErr)
}

func main() {
Expand Down
3 changes: 2 additions & 1 deletion pkg/test/dockerutil/container.go
Original file line number Diff line number Diff line change
Expand Up @@ -1000,7 +1000,8 @@ func (cp *ContainerPool) String() string {
sb.WriteString(", ")
}
status := cp.statuses[container]
sb.WriteString(container.Name)
sb.WriteString("#")
sb.WriteString(strconv.Itoa(i))
sb.WriteString("[")
sb.WriteString(status.state.String())
sb.WriteString("]")
Expand Down
Loading

0 comments on commit d9b451f

Please sign in to comment.