Skip to content

Commit

Permalink
Merge pull request #127 from gpu-mode/ngc92/run_cuda_script_iterface
Browse files (browse the repository at this point in the history)
Updated function signatures
  • Loading branch information
msaroufim authored Jan 13, 2025
2 parents 7be70c0 + 0c04a1b commit d61bd0b
Show file tree
Hide file tree
Showing 3 changed files with 243 additions and 103 deletions.
51 changes: 35 additions & 16 deletions scripts/ci_test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,9 +19,15 @@ def test_does_not_compile():
output_t custom_kernel(input_tt data) { }
"""

cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert score == 0
assert "CUDA compilation failed" in cout
comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert comp.success is False
assert run.success is False
assert comp.nvcc_found is True
assert comp.stdout == ""
assert 'train.cuh(2): error: identifier "input_tt" is undefined' in comp.stderr
assert '1 error detected in the compilation of "eval.cu".' in comp.stderr
assert comp.command.startswith("/usr/local/cuda/bin/nvcc")
assert "nvcc: NVIDIA (R) Cuda compiler driver" in comp.nvcc_version


def test_cuda_runtime_error():
Expand All @@ -44,11 +50,15 @@ def test_cuda_runtime_error():
}
"""
cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert score == 0
assert "Command '['./eval.out']' returned non-zero exit status 3." in cout
assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in cout
assert "an illegal memory access was encountered" in cout
comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert comp.success is True
assert run.success is False
assert run.command == "./eval.out"
assert "warming up..." in run.stdout
assert "cudaDeviceSynchronize() at eval.cu(64) in `measure_runtime`" in run.stderr
assert "an illegal memory access was encountered" in run.stderr
assert run.exit_code == 3
assert len(run.result) == 0


def test_cuda_validation_fail():
Expand All @@ -68,14 +78,23 @@ def test_cuda_validation_fail():
}
"""
cout, score = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert score == 0
assert "Command '['./eval.out']' returned non-zero exit status 1." in cout
assert "ERROR AT 0, 0" in cout
comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert comp.success is True
assert run.success is False
assert run.command == "./eval.out"
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
assert "ERROR AT 0, 0" in run.stderr
assert run.exit_code == 1
assert run.result["check"] == "fail"


def test_cuda_correct():
sub = Path("examples/identity_cuda/submission.cuh")

cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
assert score > 0
sub = Path("examples/identity_cuda/submission.cuh").read_text()

comp, run = run_cuda_script(cu_eval, ref.read_text(), sub, arch=None)
assert comp.success is True
assert run.success is True
assert "warming up..." in run.stdout
assert run.exit_code == 0
assert run.result["check"] == "pass"
45 changes: 44 additions & 1 deletion src/discord-cluster-manager/modal_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,12 +100,55 @@ def modal_run_cuda_script( # # noqa: C901
"""Modal version of run_cuda_script, handling timeouts"""
try:
with timeout(timeout_seconds):
return run_cuda_script(
compile_result, run_result = run_cuda_script(
script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
)

if not compile_result.success:
if not compile_result.nvcc_found:
return (
"Error executing script: NVCC not found:\n"
+ f"command `{compile_result.command}` "
+ f"failed with exit code {compile_result.exit_code}:\n"
+ compile_result.stderr,
0.0,
)
return (
"Error executing script: CUDA compilation failed with return code "
+ f"{compile_result.exit_code}:\n{compile_result.stderr}\n"
+ f"compile command: `{compile_result.command}`",
0.0,
)

if not run_result.success:
# exit code 1 encodes failed tests
if run_result.exit_code == 1:
return f"check_implementation failed:\n{run_result.stderr}", 0.0
else:
return (
f"Script failed with exit code "
f"({run_result.exit_code}):\n{run_result.stderr}",
0.0,
)

print("run process stdout:", run_result.stdout)
print("run process stderr:", run_result.stderr)

score = float(run_result.result.get("duration.mean", "0.0")) / 1e9
passed = run_result.result.get("check", "") == "pass"
if not passed:
return "check_implementation failed", 0.0

if score is None:
return run_result.stdout, run_result.duration

return run_result.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}", 0.0
Loading

0 comments on commit d61bd0b

Please sign in to comment.