Skip to content

Commit

Permalink
factored out run_cuda_eval, so it can be used independently of modal (#120)
Browse files Browse the repository at this point in the history

* factored out run_cuda_eval, so it can be used independently of modal

* refactoring based on discord discussion

* use the test script also for a simple github test
  • Loading branch information
ngc92 authored Jan 11, 2025
1 parent 47d9b21 commit 8dc35d6
Show file tree
Hide file tree
Showing 5 changed files with 234 additions and 163 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/cuda_test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# CI workflow: smoke-test the CUDA evaluation path on a self-hosted GPU runner
# by compiling and running the identity-kernel example via scripts/local-test.py.
name: CUDA Test

on:
  push:
    branches: [ main ]
  pull_request:
    branches: [ main ]

jobs:
  check-identity:
    # Self-hosted NVIDIA runner label; the job needs a real GPU.
    runs-on: [gpumode-nvidia-arc]
    timeout-minutes: 10
    container:
      # devel image so nvcc is available for compilation.
      image: nvidia/cuda:12.4.0-devel-ubuntu22.04
    steps:
      # v3 runs on the deprecated Node 16 runtime; v4 is the supported line
      # and matches the modern major version used by setup-python below.
      - uses: actions/checkout@v4

      - name: Setup Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.10'

      - name: Run script
        shell: bash
        run: python3 scripts/local-test.py
        env:
          # Pin the test to the first visible GPU.
          CUDA_VISIBLE_DEVICES: 0
15 changes: 15 additions & 0 deletions scripts/local-test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
"""Smoke test: compile and run the identity CUDA example through run_cuda_script.

Exits 0 when the kernel produced a positive score, 1 otherwise, so it can be
used directly as a CI pass/fail check.
"""
import sys
from pathlib import Path

# Make the project sources importable without installing the package.
sys.path.append("src/discord-cluster-manager")

from leaderboard_eval import cu_eval
from run_eval import run_cuda_script


def main() -> int:
    """Run the identity-kernel evaluation; return a process exit code."""
    ref = Path("examples/identity_cuda/reference.cuh")
    sub = Path("examples/identity_cuda/submission.cuh")

    # arch=None: presumably lets run_cuda_script pick a default compute
    # capability — confirm against run_eval's handling of arch.
    cout, score = run_cuda_script(cu_eval, ref.read_text(), sub.read_text(), arch=None)
    print(cout)
    print(score)
    # A positive score means the kernel compiled, ran, and passed the check.
    return 0 if score > 0 else 1


if __name__ == "__main__":
    # sys.exit() instead of the builtin exit(): exit() is injected by the
    # site module for interactive use and is not guaranteed in all contexts.
    sys.exit(main())
171 changes: 17 additions & 154 deletions src/discord-cluster-manager/modal_runner.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import signal
import subprocess
from contextlib import contextmanager
from typing import Optional

from consts import CUDA_FLAGS, MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
from modal import App, Image, Mount
from run_eval import run_cuda_script, run_pytorch_script

# Create a stub for the Modal app
# IMPORTANT: This has to stay in separate file or modal breaks
Expand Down Expand Up @@ -69,180 +69,43 @@ def timeout_handler(signum, frame):
signal.signal(signal.SIGALRM, original_handler)


def run_pytorch_script( # noqa: C901
def modal_run_pytorch_script( # noqa: C901
script_content: str,
reference_content: Optional[str] = None,
submission_content: Optional[str] = None,
timeout_seconds: int = 300,
arch: int = None,
) -> tuple[str, float]:
"""
Executes the provided PyTorch GPU kernel in an isolated environment with a timeout
Args:
script_content: The PyTorch script containing the GPU kernel to benchmark
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
timeout_seconds: Maximum execution time before timeout (default: 300 seconds)
arch: The arch code for the compute/sm versions.
Returns:
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""

import os
import time

"""Modal version of run_pytorch_script, handling timeouts"""
try:
with timeout(timeout_seconds):
# Write submission files to directory
if reference_content is not None:
with open("reference.py", "w") as f:
f.write(reference_content)

if submission_content is not None:
with open("train.py", "w") as f:
f.write(submission_content)

with open("eval.py", "w") as f:
f.write(script_content)

execution_start_time = time.perf_counter()
result = subprocess.run(
["python", "eval.py"],
stdout=subprocess.PIPE,
stderr=subprocess.PIPE,
text=True,
timeout=timeout_seconds,
run_pytorch_script(
script_content=script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
)

if result.returncode != 0:
raise RuntimeError(
"Script execution failed with return code "
+ f"{result.returncode}:\n{result.stderr}"
)

score = None
for line in result.stdout.splitlines():
if line.startswith("score:"):
score = float(line.split(":")[1].strip())
return ("score", score)

if score is None:
execution_end_time = time.perf_counter()
score = execution_end_time - execution_start_time

return result.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}", 0.0
finally:
tmp_files = ["eval.py", "reference.py", "train.py"]
for f in tmp_files:
if os.path.exists(f):
os.remove(f)


def run_cuda_script( # # noqa: C901
def modal_run_cuda_script(  # noqa: C901
script_content: str,
reference_content: str = None,
submission_content: str = None,
timeout_seconds: int = 600,
arch: int = None,
) -> tuple[str, float]:
"""
Executes the provided CUDA kernel in an isolated environment with a timeout
Args:
script_content: The CUDA script containing the GPU kernel
reference_content: The (optional) reference code, used for leaderboards.
submission_content: The (optional) submission code, used for leaderboards.
timeout_seconds: Maximum execution time in seconds (default: 600 seconds)
arch: The arch code for the compute/sm versions.
Returns:
tuple[str, float]: (Kernel output, execution time in milliseconds)
NOTE: Modal execution time is not programmatically accessible, so we manually calculate it
"""
import os
import subprocess
import time

"""Modal version of run_cuda_script, handling timeouts"""
try:
with timeout(timeout_seconds):
# Check CUDA is available and installed correctly
print("[CUDA Env Check]")
try:
# these check cuda compiler is also available
subprocess.run(["nvcc", "--version"], check=True)
subprocess.run(["which", "nvcc"], check=True)
except Exception:
return "nvcc not found.", 0.0

ARCH = f"-gencode=arch=compute_{arch},code=sm_{arch}"
NVCC_FILES = "eval.cu"
# Write submission files to directory
if reference_content is not None:
with open("reference.cuh", "w") as f:
f.write(reference_content)

if submission_content is not None:
with open("train.cuh", "w") as f:
f.write(submission_content)

with open("eval.cu", "w") as f:
f.write(script_content)

execution_start_time = time.perf_counter()
compile_process = subprocess.run(
["nvcc"]
+ CUDA_FLAGS
+ MODAL_CUDA_INCLUDE_DIRS
+ [ARCH, NVCC_FILES, "-o", "eval.out"],
capture_output=True,
text=True,
run_cuda_script(
script_content,
reference_content=reference_content,
submission_content=submission_content,
arch=arch,
include_dirs=MODAL_CUDA_INCLUDE_DIRS,
)

if compile_process.returncode != 0:
raise RuntimeError(
"CUDA compilation failed with return code "
+ f"{compile_process.returncode}:\n{compile_process.stderr}"
)

run_process = subprocess.run(["./eval.out"], capture_output=True, text=True)
execution_end_time = time.perf_counter()

print("run process stdout", run_process.stdout)

score = None
for line in run_process.stdout.splitlines():
if line.startswith("score:"):
score = float(line.split(":")[1].strip())
break

if score is None:
execution_end_time = time.perf_counter()
score = execution_end_time - execution_start_time
return (
"check_implementation failed"
if "check_implementation failed" in run_process.stdout
else None,
score,
) # To make sure error is thrown on LB

return run_process.stdout, score

except TimeoutException as e:
return f"Timeout Error: {str(e)}", 0.0
except Exception as e:
return f"Error executing script: {str(e)}", 0.0
finally:
tmp_files = ["reference.cuh", "train.cuh", "eval.cu", "eval.out"]
for f in tmp_files:
if os.path.exists(f):
os.remove(f)
18 changes: 9 additions & 9 deletions src/discord-cluster-manager/modal_runner_archs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@


from consts import GPU_TO_SM
from modal_runner import app, cuda_image, run_cuda_script, run_pytorch_script
from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script


# T4: sm_70 (CUDA 7.x, Maxwell Architecture)
Expand All @@ -17,7 +17,7 @@ def run_cuda_script_t4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -36,7 +36,7 @@ def run_pytorch_script_t4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -56,7 +56,7 @@ def run_cuda_script_l4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -75,7 +75,7 @@ def run_pytorch_script_l4(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -95,7 +95,7 @@ def run_cuda_script_a100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -114,7 +114,7 @@ def run_pytorch_script_a100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand All @@ -134,7 +134,7 @@ def run_cuda_script_h100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_cuda_script(
return modal_run_cuda_script(
script_content,
reference_content,
submission_content,
Expand All @@ -153,7 +153,7 @@ def run_pytorch_script_h100(
submission_content: str = None,
timeout_seconds: int = 600,
) -> tuple[str, float]:
return run_pytorch_script(
    return modal_run_pytorch_script(
script_content,
reference_content,
submission_content,
Expand Down
Loading

0 comments on commit 8dc35d6

Please sign in to comment.