Skip to content

Commit

Permalink
Merge branch 'main' into ngc92/aux-files
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzhang13 authored Jan 15, 2025
2 parents 48b7a5a + 7c233d3 commit 6a30fb7
Show file tree
Hide file tree
Showing 15 changed files with 501 additions and 436 deletions.
82 changes: 20 additions & 62 deletions .github/workflows/amd_workflow.yml
Original file line number Diff line number Diff line change
@@ -1,102 +1,60 @@
name: AMD PyTorch Job

on:
workflow_dispatch:
inputs:
script_content:
description: 'Content of Python script'
required: true
type: string
filename:
description: 'Name of Python script'
payload:
description: 'Content of the user submission, as json string'
required: true
type: string
reference_content:
description: 'Content of the reference code script (optional)'
required: false
type: string
reference_filename:
description: 'Name of reference script (supports .py or .cu)'
required: false
type: string
eval_content:
description: 'Content of the outer eval code script (optional)'
required: false
type: string
eval_filename:
description: 'Name of outer eval script (supports .py or .cu)'
requirements:
description: 'Contents for a requirements.txt file'
required: false
type: string

jobs:
train:
run:
runs-on: [amdgpu-mi250-x86-64]
timeout-minutes: 10
env:
VENV_DIR: /groups/aig_sharks/pytorch_venv
steps:
- uses: actions/checkout@v3
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Create script
shell: python
run: |
with open('${{ github.event.inputs.filename }}', 'w') as f:
f.write('''${{ github.event.inputs.script_content }}''')
- name: Create reference scripts if provided
shell: bash
run: |
if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
echo "Creating reference script..."
cat > "${{ github.event.inputs.reference_filename }}" <<EOL
${{ github.event.inputs.reference_content }}
EOL
cat "${{ github.event.inputs.reference_filename }}" # Debug: Show file contents
else
echo "No reference content provided."
fi

- name: Create eval scripts if provided
- name: Create input files
shell: bash
run: |
if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
echo "Creating reference script..."
cat > "${{ github.event.inputs.eval_filename }}" <<EOL
${{ github.event.inputs.eval_content }}
cat > "payload.json" <<'EOL'
${{ github.event.inputs.payload }}
EOL
cat "${{ github.event.inputs.eval_filename }}" # Debug: Show file contents
else
echo "No eval content provided."
fi
- name: Setup Virtual Environment and Install Dependencies
shell: bash
run: |
python -m venv ${VENV_DIR}
source ${VENV_DIR}/bin/activate
pip install --upgrade pip
pip install --pre pytorch-triton-rocm==3.1.0+cf34004b8a torch==2.6.0.dev20241023+rocm6.2 --index-url https://download.pytorch.org/whl/nightly/rocm6.2
if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
cat > "requirements.txt" <<'EOL'
${{ github.event.inputs.requirements }}
EOL
pip install -r "requirements.txt"
fi
- name: Run script
shell: bash
run: |
if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
echo "Running Python file..."
python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
cat training.log # Debug: show output
else
echo "Running Python file..."
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
cat training.log # Debug: show output
fi
python3 .github/workflows/runner.py
cat result.json # Debug: show output
- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: training-artifacts
name: run-result
path: |
training.log
${{ github.event.inputs.filename }}
result.json
158 changes: 46 additions & 112 deletions .github/workflows/nvidia_workflow.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,134 +2,68 @@ name: NVIDIA PyTorch/CUDA Job
on:
workflow_dispatch:
inputs:
script_content:
description: 'Content of Python/CUDA script (.py or .cu file)'
payload:
description: 'Content of the user submission, as json string'
required: true
type: string
filename:
description: 'Name of script (supports .py or .cu)'
required: true
type: string
reference_content:
description: 'Content of the reference code script (optional)'
required: false
type: string
reference_filename:
description: 'Name of reference script (supports .py or .cu)'
required: false
type: string
eval_content:
description: 'Content of the outer eval code script (optional)'
required: false
type: string
eval_filename:
description: 'Name of outer eval script (supports .py or .cu)'
requirements:
description: 'Contents for a requirements.txt file'
required: false
type: string

jobs:
train:
run:
runs-on: [gpumode-nvidia-arc]
timeout-minutes: 10
container:
image: nvidia/cuda:12.4.0-devel-ubuntu22.04
steps:
- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Setup Python environment
run: |
uv venv .venv
echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
- name: Create script file
shell: bash
run: |
cat << 'EOL' > ${{ github.event.inputs.filename }}
${{ github.event.inputs.script_content }}
EOL
cat ${{ github.event.inputs.filename }} # Debug: show file contents
- uses: actions/checkout@v3

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: '3.10'

- name: Create input files
shell: bash
run: |
cat > "payload.json" <<'EOL'
${{ github.event.inputs.payload }}
EOL
- name: Create reference scripts if provided
shell: bash
run: |
if [[ -n "${{ github.event.inputs.reference_filename }}" ]]; then
echo "Creating reference script..."
cat > "${{ github.event.inputs.reference_filename }}" <<EOL
${{ github.event.inputs.reference_content }}
EOL
cat "${{ github.event.inputs.reference_filename }}" # Debug: Show file contents
else
echo "No reference content provided."
fi
- name: Install uv
uses: astral-sh/setup-uv@v3
with:
version: "latest"

- name: Create eval scripts if provided
shell: bash
run: |
if [[ -n "${{ github.event.inputs.eval_filename }}" ]]; then
echo "Creating eval script..."
cat > "${{ github.event.inputs.eval_filename }}" <<EOL
${{ github.event.inputs.eval_content }}
EOL
cat "${{ github.event.inputs.eval_filename }}" # Debug: Show file contents
else
echo "No eval content provided."
fi
- name: Setup Python environment
shell: bash
run: |
uv venv .venv
echo "VIRTUAL_ENV=$PWD/.venv" >> $GITHUB_ENV
echo "$PWD/.venv/bin" >> $GITHUB_PATH
- name: Install dependencies
run: |
if grep -rE "(import torch|from torch)" "${{ github.event.inputs.filename }}"; then
echo "PyTorch detected, installing torch"
uv pip install numpy torch setuptools ninja
fi
if grep -rE "(import triton|from triton)" "${{ github.event.inputs.filename }}"; then
echo "Triton detected, installing triton"
uv pip install triton
fi
if [[ -n "${{ github.event.inputs.requirements }}" ]]; then
cat > "requirements.txt" <<'EOL'
${{ github.event.inputs.requirements }}
EOL
uv pip install -r "requirements.txt"
fi
- name: Run script
shell: bash
run: |
# Check if eval content exists without trying to evaluate it
if [ -f "${{ github.event.inputs.eval_filename }}" ]; then
if [[ "${{ github.event.inputs.eval_filename }}" == *.cu ]]; then
echo "Compiling and running CUDA files..."
CUDA_FILES="${{ github.event.inputs.eval_filename }}"
echo "Compiling: $CUDA_FILES"
nvcc $CUDA_FILES -o cuda_program
./cuda_program > training.log 2>&1
else
echo "Running Python file..."
python3 "${{ github.event.inputs.eval_filename }}" > training.log 2>&1
fi
else
if [[ "${{ github.event.inputs.filename }}" == *.cu ]]; then
echo "Compiling and running CUDA file..."
nvcc "${{ github.event.inputs.filename }}" -o cuda_program
./cuda_program > training.log 2>&1
else
echo "Running Python file..."
python3 "${{ github.event.inputs.filename }}" > training.log 2>&1
fi
fi
cat training.log # Debug: show output
- name: Run script
shell: bash
run: |
python .github/workflows/runner.py
cat result.json # Debug: show output
- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: training-artifacts
path: |
training.log
${{ github.event.inputs.filename }}
- name: Upload training artifacts
uses: actions/upload-artifact@v4
if: always()
with:
name: run-result
path: |
result.json
env:
CUDA_VISIBLE_DEVICES: 0
31 changes: 31 additions & 0 deletions .github/workflows/runner.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
import json
import sys
from dataclasses import asdict
from pathlib import Path

sys.path.append("src/discord-cluster-manager")

from leaderboard_eval import cu_eval, py_eval
from run_eval import run_cuda_script, run_pytorch_script

# Read the job description from payload.json, then delete the file so it is no
# longer on disk by the time the evaluation below starts.
payload_file = Path("payload.json")
config: dict = json.loads(payload_file.read_text())
payload_file.unlink()

# Dispatch on the "lang" field: "cu" goes to the CUDA runner, every other value
# goes to the Python runner. For each source, the payload entry is used when
# present; the eval script falls back to the bundled default, the reference and
# submission fall back to None.
# NOTE(review): the positional argument order passed to the run_* helpers is
# taken as-is; confirm it against the current signatures in run_eval.
if config["lang"] != "cu":
    run = run_pytorch_script(
        config.get("eval.py", py_eval),
        config.get("reference.py"),
        config.get("submission.py"),
        arch=None,
    )
    # The Python path has no compile step, so only the run record is reported.
    result = {"run": asdict(run)}
else:
    comp, run = run_cuda_script(
        config.get("eval.cu", cu_eval),
        config.get("reference.cuh"),
        config.get("submission.cuh"),
        arch=None,
    )
    # The CUDA path reports both the compile record and the run record.
    result = {"compile": asdict(comp), "run": asdict(run)}

# Serialize the collected records to result.json as a single JSON document.
Path("result.json").write_text(json.dumps(result))
2 changes: 1 addition & 1 deletion examples/identity_py/reference.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,6 @@ def generate_input() -> List[torch.Tensor]:


if __name__ == "__main__":
inputs = generate_input(seed=42)
inputs = generate_input()
for idx, tensor in enumerate(inputs):
print(f"Input Tensor {idx + 1} (Shape: {tensor.shape}):\n{tensor}")
3 changes: 2 additions & 1 deletion scripts/ci_test_cuda.py
Original file line number Diff line number Diff line change
Expand Up @@ -88,7 +88,8 @@ def test_cuda_validation_fail():
{"eval.cu": cu_eval}, {"reference.cuh": ref.read_text(), "train.cuh": sub}, arch=None
)
assert comp.success is True
assert run.success is False
assert run.success is True
assert run.passed is False
assert run.command == "./eval.out"
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
Expand Down
6 changes: 4 additions & 2 deletions scripts/ci_test_python.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,10 +35,12 @@ def test_error():
def custom_kernel(input):
return [torch.zeros_like(i) for i in input]
"""

run = run_pytorch_script(
{"eval.py": py_eval, "reference.py": ref.read_text(), "train.py": sub}, "eval.py"
{"eval.py": py_eval, "reference.py": ref.read_text(), "train.py": sub}, "eval.py", arch=None
)
assert run.success is False
assert run.success is True
assert run.passed is False
assert run.command == "python eval.py"
# we never reach the benchmark part, because the test fails
assert "warming up..." not in run.stdout
Expand Down
Loading

0 comments on commit 6a30fb7

Please sign in to comment.