Merge branch 'main' into ngc92/aux-files

gpu-mode · Jan 15, 2025 · 6a30fb7 · 6a30fb7
2 parents 48b7a5a + 7c233d3
commit 6a30fb7
Show file tree

Hide file tree

Showing 15 changed files with 501 additions and 436 deletions.
diff --git a/.github/workflows/amd_workflow.yml b/.github/workflows/amd_workflow.yml
@@ -1,102 +1,60 @@
 name: AMD PyTorch Job
-
 on:
   workflow_dispatch:
     inputs:
-      script_content:
-        description: 'Content of Python script'
-        required: true
-        type: string
-      filename:
-        description: 'Name of Python script'
+      payload:
+        description: 'Content of the user submission, as json string'
         required: true
         type: string
-      reference_content:
-        description: 'Content of the reference code script (optional)'
-        required: false
-        type: string
-      reference_filename:
-        description: 'Name of reference script (supports .py or .cu)'
-        required: false
-        type: string
-      eval_content:
-        description: 'Content of the outer eval code script (optional)'
-        required: false
-        type: string
-      eval_filename:
-        description: 'Name of outer eval script (supports .py or .cu)'
+      requirements:
+        description: 'Contents for a requirements.txt file'
         required: false
         type: string
 
 jobs:
-  train:
+  run:
     runs-on: [amdgpu-mi250-x86-64]
     timeout-minutes: 10
     env:
       VENV_DIR: /groups/aig_sharks/pytorch_venv
     steps:
+    - uses: actions/checkout@v3
     - name: Setup Python
       uses: actions/setup-python@v5
       with:
         python-version: '3.10'
 
-    - name: Create script
-      shell: python
-      run: |
-        with open('${{ github.event.inputs.filename }}', 'w') as f:
-            f.write('''${{ github.event.inputs.script_content }}''')
-
-    - name: Create reference scripts if provided
-      shell: bash
-      run: |
-        if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
-          echo "Creating reference script..."
-          cat > "${{ github.event.inputs.reference_filename }}" <<EOL
-        ${{ github.event.inputs.reference_content }}
-        EOL
-            cat "${{ github.event.inputs.reference_filename }}"  # Debug: Show file contents
-          else
-              echo "No reference content provided."
-          fi
 
-    - name: Create eval scripts if provided
+    - name: Create input files
       shell: bash
+      env:  # hand the untrusted input over as data instead of pasting it into the script text
+        PAYLOAD: ${{ github.event.inputs.payload }}
       run: |
-        if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
-          echo "Creating reference script..."
-          cat > "${{ github.event.inputs.eval_filename }}" <<EOL
-        ${{ github.event.inputs.eval_content }}
-        EOL
-            cat "${{ github.event.inputs.eval_filename }}"  # Debug: Show file contents
-          else
-              echo "No eval content provided."
-          fi
+        printf '%s\n' "$PAYLOAD" > payload.json  # verbatim copy; a payload line reading EOL can no longer end a heredoc
 
     - name: Setup Virtual Environment and Install Dependencies
+      shell: bash
+      env:  # untrusted input travels as an environment variable, so bash never parses its contents
+        REQUIREMENTS: ${{ github.event.inputs.requirements }}
       run: |
         python -m venv ${VENV_DIR}
         source ${VENV_DIR}/bin/activate
         pip install --upgrade pip
-        pip install --pre pytorch-triton-rocm==3.1.0+cf34004b8a torch==2.6.0.dev20241023+rocm6.2 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
 
+        if [[ -n "$REQUIREMENTS" ]]; then  # optional input: skip pip entirely when nothing was supplied
+          printf '%s\n' "$REQUIREMENTS" > requirements.txt  # quotes, $(...) and backticks are written literally
+          pip install -r requirements.txt
+        fi
     - name: Run script
       shell: bash
       run: |
-        if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
-          echo "Running Python file..."
-          python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
-          cat training.log  # Debug: show output
-        else
-          echo "Running Python file..."
-          python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
-          cat training.log  # Debug: show output
-        fi
+        python3 .github/workflows/runner.py
+        cat result.json  # Debug: show output
 
     - name: Upload training artifacts
       uses: actions/upload-artifact@v4
       if: always()
       with:
-        name: training-artifacts
+        name: run-result
         path: |
-          training.log
-          ${{ github.event.inputs.filename }}
+          result.json
diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
@@ -2,134 +2,68 @@ name: NVIDIA PyTorch/CUDA Job
 on:
   workflow_dispatch:
     inputs:
-      script_content:
-        description: 'Content of Python/CUDA script (.py or .cu file)'
+      payload:
+        description: 'Content of the user submission, as json string'
         required: true
         type: string
-      filename:
-        description: 'Name of script (supports .py or .cu)'
-        required: true
-        type: string
-      reference_content:
-        description: 'Content of the reference code script (optional)'
-        required: false
-        type: string
-      reference_filename:
-        description: 'Name of reference script (supports .py or .cu)'
-        required: false
-        type: string
-      eval_content:
-        description: 'Content of the outer eval code script (optional)'
-        required: false
-        type: string
-      eval_filename:
-        description: 'Name of outer eval script (supports .py or .cu)'
+      requirements:
+        description: 'Contents for a requirements.txt file'
         required: false
         type: string
 
 jobs:
-  train:
+  run:
     runs-on: [gpumode-nvidia-arc]
     timeout-minutes: 10
     container:
       image: nvidia/cuda:12.4.0-devel-ubuntu22.04
     steps:
-      - name: Setup Python
-        uses: actions/setup-python@v5
-        with:
-          python-version: '3.10'
-
-      - name: Install uv
-        uses: astral-sh/setup-uv@v3
-        with:
-          version: "latest"
-
-      - name: Setup Python environment
-        run: |
-          uv venv .venv
-          echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
-          echo "$PWD/.venv/bin" >> $GITHUB_PATH
-          
-      - name: Create script file
-        shell: bash
-        run: |
-          cat << 'EOL' > ${{ github.event.inputs.filename }}
-          ${{ github.event.inputs.script_content }}
-          EOL
-          cat ${{ github.event.inputs.filename }}  # Debug: show file contents
+    - uses: actions/checkout@v3
+
+    - name: Setup Python
+      uses: actions/setup-python@v5
+      with:
+        python-version: '3.10'
+
+    - name: Create input files
+      shell: bash
+      env:  # hand the untrusted input over as data instead of pasting it into the script text
+        PAYLOAD: ${{ github.event.inputs.payload }}
+      run: |
+        printf '%s\n' "$PAYLOAD" > payload.json  # verbatim copy; a payload line reading EOL can no longer end a heredoc
 
-      - name: Create reference scripts if provided
-        shell: bash
-        run: |
-          if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
-            echo "Creating reference script..."
-            cat > "${{ github.event.inputs.reference_filename }}" <<EOL
-          ${{ github.event.inputs.reference_content }}
-          EOL
-              cat "${{ github.event.inputs.reference_filename }}"  # Debug: Show file contents
-            else
-                echo "No reference content provided."
-            fi
+    - name: Install uv
+      uses: astral-sh/setup-uv@v3
+      with:
+        version: "latest"
 
-      - name: Create eval scripts if provided
-        shell: bash
-        run: |
-          if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
-            echo "Creating eval script..."
-            cat > "${{ github.event.inputs.eval_filename }}" <<EOL
-          ${{ github.event.inputs.eval_content }}
-          EOL
-              cat "${{ github.event.inputs.eval_filename }}"  # Debug: Show file contents
-            else
-                echo "No eval content provided."
-            fi
+    - name: Setup Python environment
+      shell: bash
+      env:  # untrusted input travels as an environment variable, so bash never parses its contents
+        REQUIREMENTS: ${{ github.event.inputs.requirements }}
+      run: |
+        uv venv .venv
+        echo "VIRTUAL_ENV=$PWD/.venv" >> "$GITHUB_ENV"  # exported for the steps that follow this one
+        echo "$PWD/.venv/bin" >> "$GITHUB_PATH"
 
-      - name: Install dependencies
-        run: |
-          if grep -rE "(import torch|from torch)" "${{ github.event.inputs.filename }}"; then
-            echo "PyTorch detected, installing torch"
-            uv pip install numpy torch setuptools ninja
-          fi
-          if grep -rE "(import triton|from triton)" "${{ github.event.inputs.filename }}"; then
-            echo "Triton detected, installing triton"
-            uv pip install triton
-          fi
+        if [[ -n "$REQUIREMENTS" ]]; then  # optional input: skip the install when nothing was supplied
+          printf '%s\n' "$REQUIREMENTS" > requirements.txt  # quotes, $(...) and backticks are written literally
+          uv pip install -r requirements.txt
+        fi
 
-      - name: Run script
-        shell: bash
-        run: |
-          # Check if eval content exists without trying to evaluate it
-          if [ -f "${{ github.event.inputs.eval_filename }}" ]; then
-            if [[ "${{ github.event.inputs.eval_filename }}" == *.cu ]]; then
-              echo "Compiling and running CUDA files..."
-              CUDA_FILES="${{ github.event.inputs.eval_filename }}"
-              echo "Compiling: $CUDA_FILES"
-              nvcc $CUDA_FILES -o cuda_program
-              ./cuda_program > training.log 2>&1
-            else
-              echo "Running Python file..."
-              python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
-            fi
-          else
-            if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
-              echo "Compiling and running CUDA file..."
-              nvcc "${{ github.event.inputs.filename }}" -o cuda_program
-              ./cuda_program > training.log 2>&1
-            else
-              echo "Running Python file..."
-              python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
-            fi
-          fi
-          cat training.log  # Debug: show output
+    - name: Run script
+      shell: bash
+      run: |
+        python .github/workflows/runner.py
+        cat result.json  # Debug: show output
 
-      - name: Upload training artifacts
-        uses: actions/upload-artifact@v4
-        if: always()
-        with:
-          name: training-artifacts
-          path: |
-            training.log
-            ${{ github.event.inputs.filename }}
+    - name: Upload training artifacts
+      uses: actions/upload-artifact@v4
+      if: always()
+      with:
+        name: run-result
+        path: |
+          result.json
 
     env:
       CUDA_VISIBLE_DEVICES: 0
diff --git a/.github/workflows/runner.py b/.github/workflows/runner.py
@@ -0,0 +1,31 @@
+import json
+import sys
+from dataclasses import asdict
+from pathlib import Path
+
+sys.path.append("src/discord-cluster-manager")  # relative path: resolved against the current working directory
+
+from leaderboard_eval import cu_eval, py_eval  # presumably found via the path appended above - confirm
+from run_eval import run_cuda_script, run_pytorch_script
+
+config = json.loads(Path("payload.json").read_text())  # type: dict
+Path("payload.json").unlink()  # delete the payload file once it has been parsed
+
+if config["lang"] == "cu":  # KeyError if "lang" is absent; any value other than "cu" takes the else branch
+    comp, run = run_cuda_script(  # NOTE(review): scripts/ci_test_cuda.py calls this with two dicts - confirm this positional form
+        config.get("eval.cu", cu_eval),  # imported cu_eval is the fallback when the payload has no "eval.cu"
+        config.get("reference.cuh", None),
+        config.get("submission.cuh", None),
+        arch=None,
+    )
+    result = {"compile": asdict(comp), "run": asdict(run)}  # asdict() requires dataclass instances
+else:
+    run = run_pytorch_script(  # NOTE(review): scripts/ci_test_python.py passes (dict, "eval.py") - confirm this positional form
+        config.get("eval.py", py_eval),  # imported py_eval is the fallback when the payload has no "eval.py"
+        config.get("reference.py", None),
+        config.get("submission.py", None),
+        arch=None,
+    )
+    result = {"run": asdict(run)}  # no "compile" entry on this branch
+
+Path("result.json").write_text(json.dumps(result))  # relative path: lands in the current working directory
diff --git a/examples/identity_py/reference.py b/examples/identity_py/reference.py
@@ -41,6 +41,6 @@ def generate_input() -> List[torch.Tensor]:
 
 
 if __name__ == "__main__":
-    inputs = generate_input(seed=42)
+    inputs = generate_input()
     for idx, tensor in enumerate(inputs):
         print(f"Input Tensor {idx + 1} (Shape: {tensor.shape}):\n{tensor}")
diff --git a/scripts/ci_test_cuda.py b/scripts/ci_test_cuda.py
@@ -88,7 +88,8 @@ def test_cuda_validation_fail():
         {"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "train.cuh": sub}, arch=None
     )
     assert comp.success is True
-    assert run.success is False
+    assert run.success is True
+    assert run.passed is False
     assert run.command == "./eval.out"
     # we never reach the benchmark part, because the test fails
     assert "warming up..." not in run.stdout

diff --git a/scripts/ci_test_python.py b/scripts/ci_test_python.py
@@ -35,10 +35,12 @@ def test_error():
 def custom_kernel(input):
     return [torch.zeros_like(i) for i in input]
         """
+
     run = run_pytorch_script(
-        {"eval.py": py_eval, "reference.py": ref.read_text(), "train.py": sub}, "eval.py"
+        {"eval.py": py_eval, "reference.py": ref.read_text(), "train.py": sub}, "eval.py", arch=None
     )
-    assert run.success is False
+    assert run.success is True
+    assert run.passed is False
     assert run.command == "python eval.py"
     # we never reach the benchmark part, because the test fails
     assert "warming up..." not in run.stdout