Skip to content

Commit

Permalink
factored out run_cuda_eval, so it can be used independently of modal (#120)
Browse files Browse the repository at this point in the history

* factored out run_cuda_eval, so it can be used independently of modal

* refactoring based on discord discussion

* use the test script also for a simple github test
  • Loading branch information
ngc92 authored Jan 11, 2025
1 parent 47d9b21 commit 8dc35d6
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 163 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/cuda_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: smoke-test the CUDA evaluation path on a self-hosted GPU runner
# by compiling and running the identity-kernel example via scripts/local-test.py.
name: CUDA Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  check-identity:
    # Self-hosted NVIDIA runner label; the job needs a real GPU.
    runs-on: [gpumode-nvidia-arc]
    timeout-minutes: 10
    container:
      # devel image so nvcc is available for compilation.
      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
    steps:
      # v3 runs on the deprecated Node 16 runtime; v4 is the supported line
      # and matches the modern major version used by setup-python below.
      - uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Run script
        shell: bash
        run: python3 scripts/local-test.py
        env:
          # Pin the test to the first visible GPU.
          CUDA_VISIBLE_DEVICES: 0
15 changes: 15 additions & 0 deletions scripts/local-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Smoke test: compile and run the identity CUDA example through run_cuda_script.

Exits 0 when the kernel produced a positive score, 1 otherwise, so it can be
used directly as a CI pass/fail check.
"""
import sys
from pathlib import Path

# Make the project sources importable without installing the package.
sys.path.append("src/discord-cluster-manager")

from leaderboard_eval import cu_eval
from run_eval import run_cuda_script


def main() -> int:
    """Run the identity-kernel evaluation; return a process exit code."""
    ref = Path("examples/identity_cuda/reference.cuh")
    sub = Path("examples/identity_cuda/submission.cuh")

    # arch=None: presumably lets run_cuda_script pick a default compute
    # capability — confirm against run_eval's handling of arch.
    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
    print(cout)
    print(score)
    # A positive score means the kernel compiled, ran, and passed the check.
    return 0 if score > 0 else 1


if __name__ == "__main__":
    # sys.exit() instead of the builtin exit(): exit() is injected by the
    # site module for interactive use and is not guaranteed in all contexts.
    sys.exit(main())
171 changes: 17 additions & 154 deletions src/discord-cluster-manager/modal_runner.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import signal
import subprocess
from contextlib import contextmanager
from typing import Optional

from consts import CUDA_FLAGS, MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
from modal import App, Image, Mount
from run_eval import run_cuda_script, run_pytorch_script

# Create a stub for the Modal app
# IMPORTANT: This has to stay in separate file or modal breaks
Expand Down Expand Up @@ -69,180 +69,43 @@ def timeout_handler(signum, frame):
signal.signal(signal.SIGALRM, original_handler)


def run_pytorch_script( # noqa: C901
def modal_run_pytorch_script( # noqa: C901
script_content: str,
reference_content: Optional[str] = None,
submission_content: Optional[str] = None,
timeout_seconds: int = 300,
arch: int = None,
) -> tuple[str, float]:
"""
Executes the provided PyTorch GPU kernel in an isolated environment with a timeout
Args:
script_content: The PyTorch script containing the GPU kernel to benchmark
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
timeout_seconds: Maximum execution time before timeout (default: 300 seconds)
arch: The arch code for the compute/sm versions.
Returns:
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""

import os
import time

"""Modal version of run_pytorch_script, handling timeouts"""
try:
with timeout(timeout_seconds):
# Write submission files to directory
if reference_content is not None:
with open("reference.py", "w") as f:
f.write(reference_content)

if submission_content is not None:
with open("train.py", "w") as f:
f.write(submission_content)

with open("eval.py", "w") as f:
f.write(script_content)

execution_start_time = time.perf_counter()
result = subprocess.run(
["python", "eval.py"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=timeout_seconds,
run_pytorch_script(
script_content=script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
)

if result.returncode != 0:
raise RuntimeError(
"Script execution failed with return code "
+ f"{result.returncode}:\n{result.stderr}"
)

score = None
for line in result.stdout.splitlines():
if line.startswith("score:"):
score = float(line.split(":")[1].strip())
return ("score", score)

if score is None:
execution_end_time = time.perf_counter()
score = execution_end_time - execution_start_time

return result.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}", 0.0
finally:
tmp_files = ["eval.py", "reference.py", "train.py"]
for f in tmp_files:
if os.path.exists(f):
os.remove(f)


def run_cuda_script( # # noqa: C901
def modal_run_cuda_script(  # noqa: C901
script_content: str,
reference_content: str = None,
submission_content: str = None,
timeout_seconds: int = 600,
arch: int = None,
) -> tuple[str, float]:
"""
Executes the provided CUDA kernel in an isolated environment with a timeout
Args:
script_content: The CUDA script containing the GPU kernel
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
timeout_seconds: Maximum execution time in seconds (default: 600 seconds)
arch: The arch code for the compute/sm versions.
Returns:
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""
import os
import subprocess
import time

"""Modal version of run_cuda_script, handling timeouts"""
try:
with timeout(timeout_seconds):
# Check CUDA is available and installed correctly
print("[CUDA Env Check]")
try:
# these check cuda compiler is also available
subprocess.run(["nvcc", "--version"], check=True)
subprocess.run(["which", "nvcc"], check=True)
except Exception:
return "nvcc not found.", 0.0

ARCH = f"-gencode=arch=compute_{arch},code=sm_{arch}"
NVCC_FILES = "eval.cu"
# Write submission files to directory
if reference_content is not None:
with open("reference.cuh", "w") as f:
f.write(reference_content)

if submission_content is not None:
with open("train.cuh", "w") as f:
f.write(submission_content)

with open("eval.cu", "w") as f:
f.write(script_content)

execution_start_time = time.perf_counter()
compile_process = subprocess.run(
["nvcc"]
+ CUDA_FLAGS
+ MODAL_CUDA_INCLUDE_DIRS
+ [ARCH, NVCC_FILES, "-o", "eval.out"],
capture_output=True,
text=True,
run_cuda_script(
script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
)

if compile_process.returncode != 0:
raise RuntimeError(
"CUDA compilation failed with return code "
+ f"{compile_process.returncode}:\n{compile_process.stderr}"
)

run_process = subprocess.run(["./eval.out"], capture_output=True, text=True)
execution_end_time = time.perf_counter()

print("run process stdout", run_process.stdout)

score = None
for line in run_process.stdout.splitlines():
if line.startswith("score:"):
score = float(line.split(":")[1].strip())
break

if score is None:
execution_end_time = time.perf_counter()
score = execution_end_time - execution_start_time
return (
"check_implementation failed"
if "check_implementation failed" in run_process.stdout
else None,
score,
) # To make sure error is thrown on LB

return run_process.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}", 0.0
finally:
tmp_files = ["reference.cuh", "train.cuh", "eval.cu", "eval.out"]
for f in tmp_files:
if os.path.exists(f):
os.remove(f)
18 changes: 9 additions & 9 deletions src/discord-cluster-manager/modal_runner_archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


from consts import GPU_TO_SM
from modal_runner import app, cuda_image, run_cuda_script, run_pytorch_script
from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script


# T4: sm_70 (CUDA 7.x, Maxwell Architecture)
Expand All @@ -17,7 +17,7 @@ def run_cuda_script_t4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -36,7 +36,7 @@ def run_pytorch_script_t4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -56,7 +56,7 @@ def run_cuda_script_l4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -75,7 +75,7 @@ def run_pytorch_script_l4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -95,7 +95,7 @@ def run_cuda_script_a100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -114,7 +114,7 @@ def run_pytorch_script_a100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -134,7 +134,7 @@ def run_cuda_script_h100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -153,7 +153,7 @@ def run_pytorch_script_h100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand Down
Loading

0 comments on commit 8dc35d6

Please sign in to comment.