unified runner code
ngc92 committed Jan 16, 2025 · 1 parent 598ea6d · commit 64e37e5
Showing 8 changed files with 118 additions and 272 deletions.
23 changes: 3 additions & 20 deletions .github/workflows/runner.py
@@ -5,28 +5,11 @@
 
 sys.path.append("src/discord-cluster-manager")
 
-from leaderboard_eval import cu_eval, py_eval
-from run_eval import run_cuda_script, run_pytorch_script
+from run_eval import run_config
 
-config = json.loads(Path("payload.json").read_text())  # type: dict
+config = json.loads(Path("payload.json").read_text())
 Path("payload.json").unlink()
 
-if config["lang"] == "cu":
-    comp, run = run_cuda_script(
-        {"eval.cu": cu_eval},
-        {key: config[key] for key in ["reference.cuh", "submission.cuh"] if key in config},
-        arch=None,
-    )
-    result = {"compile": asdict(comp), "run": asdict(run)}
-else:
-    run = run_pytorch_script(
-        {
-            "eval.py": py_eval,
-            **{key: config[key] for key in ["reference.py", "submission.py"] if key in config},
-        },
-        main="eval.py",
-        arch=None,
-    )
-    result = {"run": asdict(run)}
+result = asdict(run_config(config))
 
 Path("result.json").write_text(json.dumps(result))
15 changes: 7 additions & 8 deletions src/discord-cluster-manager/cogs/github_cog.py
@@ -15,7 +15,7 @@
 from leaderboard_eval import amd_requirements, nvidia_requirements
 from report import generate_report
 from run_eval import CompileResult, FullResult, RunResult
-from utils import get_github_branch_name, send_discord_message, setup_logging
+from utils import build_task_config, get_github_branch_name, send_discord_message, setup_logging
 
 logger = setup_logging()
@@ -113,15 +113,14 @@ async def trigger_github_run(
             # TODO implement HIP
             raise ValueError("Cannot use CUDA runs with AMD GPUs")
 
-        eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang]
-        ref_name = {"py": "reference.py", "cu": "reference.cuh"}[lang]
-        sub_name = {"py": "submission.py", "cu": "submission.cuh"}[lang]
         lang_name = {"py": "Python", "cu": "CUDA"}[lang]
 
-        if reference_content is None:
-            config = {eval_name: script_content, "lang": lang}
-        else:
-            config = {ref_name: reference_content, sub_name: script_content, "lang": lang}
+        config = build_task_config(
+            lang=lang,
+            reference_content=reference_content,
+            submission_content=script_content,
+            arch=None,
+        )
 
         logger.info(f"Attempting to trigger GitHub action for {lang_name} on {gpu_type.name}")
         gh = Github(GITHUB_TOKEN)
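build_task_config itself lives in utils.py, which is not among the hunks shown here. Judging by its call sites and the inline logic it replaces, a minimal sketch could look like this (the file-name mapping is copied from the removed code; everything else is an assumption):

# Hypothetical reconstruction of utils.build_task_config; not part of this
# diff. It mirrors the dict-building logic removed from github_cog.py above.
def build_task_config(
    lang: str,
    reference_content: str | None,
    submission_content: str,
    arch: str | None,
) -> dict:
    eval_name = {"py": "eval.py", "cu": "eval.cu"}[lang]
    ref_name = {"py": "reference.py", "cu": "reference.cuh"}[lang]
    sub_name = {"py": "submission.py", "cu": "submission.cuh"}[lang]
    if reference_content is None:
        # Stand-alone script run: the submission is the entire program.
        return {eval_name: submission_content, "lang": lang, "arch": arch}
    return {
        ref_name: reference_content,
        sub_name: submission_content,
        "lang": lang,
        "arch": arch,
    }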
53 changes: 19 additions & 34 deletions src/discord-cluster-manager/cogs/modal_cog.py
@@ -3,13 +3,12 @@
 
 import discord
 import modal
-from consts import ModalGPU
+from consts import GPU_TO_SM, ModalGPU
 from discord import app_commands
 from discord.ext import commands
-from leaderboard_eval import cu_eval, py_eval
 from report import generate_report
 from run_eval import FullResult
-from utils import send_discord_message, setup_logging
+from utils import build_task_config, send_discord_message, setup_logging
 
 logger = setup_logging()
@@ -99,41 +98,27 @@ async def handle_modal_execution(
         try:
             loop = asyncio.get_event_loop()
             func_type = "pytorch" if filename.endswith(".py") else "cuda"
+            lang = "py" if filename.endswith(".py") else "cu"
             func_name = f"run_{func_type}_script_{gpu_type.lower()}"
 
-            if reference_content is not None:
-                result = await loop.run_in_executor(
-                    None,
-                    lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
-                        py_eval if filename.endswith(".py") else cu_eval,
-                        reference_content=reference_content,
-                        submission_content=script_content,
-                    ),
-                )
-
-                # Send results
-                await thread.send(f"\n**Script size:** {len(script_content)} bytes")
-                await generate_report(thread, result)
-                return result
-
-            else:
-                # Currently broken?
-                result = await loop.run_in_executor(
-                    None,
-                    lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
-                        script_content,
-                    ),
-                )
-                await send_discord_message(
-                    interaction, f"Modal job completed in thread {thread.jump_url}", ephemeral=True
-                )
+            config = build_task_config(
+                lang=lang,
+                reference_content=reference_content,
+                submission_content=script_content,
+                arch=GPU_TO_SM[gpu_type.upper()],
+            )
 
-                # Send results
-                await thread.send(f"\n**Script size:** {len(script_content)} bytes")
-                await thread.send(f"**Execution time:** {result.run.duration:.3f} s\n")
+            result = await loop.run_in_executor(
+                None,
+                lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
+                    config=config
+                ),
+            )
 
-                await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
-                return result
+            # Send results
+            await thread.send(f"\n**Script size:** {len(script_content)} bytes")
+            await generate_report(thread, result)
+            return result
 
         except Exception as e:
             logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)
8 changes: 4 additions & 4 deletions src/discord-cluster-manager/consts.py
@@ -56,10 +56,10 @@ def combine_enums(enums: list[Type[Enum]], combined_name: str) -> Enum:
 
 
 GPU_TO_SM = {
-    "T4": 75,
-    "L4": 80,
-    "A100": 80,
-    "H100": 90,
+    "T4": "75",
+    "L4": "80",
+    "A100": "80",
+    "H100": "90a",
 }
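The SM values switch from integers to strings because a plain number can no longer express every target: Hopper's architecture-specific feature set is selected with the "90a" suffix (sm_90a), which has no integer form. Presumably the value ends up in an nvcc -arch flag roughly as in this sketch (the real flag assembly happens in run_eval.py, which this diff does not show):

# Assumed use of GPU_TO_SM when compiling; illustrative only.
GPU_TO_SM = {"T4": "75", "L4": "80", "A100": "80", "H100": "90a"}

def arch_flag(gpu: str) -> str:
    # "90a" yields "-arch=sm_90a", the architecture-specific Hopper target.
    return f"-arch=sm_{GPU_TO_SM[gpu]}"

assert arch_flag("H100") == "-arch=sm_90a"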


53 changes: 8 additions & 45 deletions src/discord-cluster-manager/modal_runner.py
@@ -1,10 +1,10 @@
 import signal
 import traceback
 from contextlib import contextmanager
 from typing import Optional
 
-from consts import MODAL_CUDA_INCLUDE_DIRS, MODAL_PATH
+from consts import MODAL_PATH
 from modal import App, Image, Mount
-from run_eval import FullResult, run_cuda_script, run_pytorch_script
+from run_eval import FullResult, run_config
 
 # Create a stub for the Modal app
 # IMPORTANT: This has to stay in separate file or modal breaks
@@ -74,55 +74,18 @@ def timeout_handler(signum, frame):
     signal.signal(signal.SIGALRM, original_handler)
 
 
-def modal_run_pytorch_script(  # noqa: C901
-    script_content: str,
-    reference_content: Optional[str] = None,
-    submission_content: Optional[str] = None,
+def modal_run_config(  # noqa: C901
+    config: dict,
     timeout_seconds: int = 300,
-    arch: int = None,
 ) -> FullResult:
-    """Modal version of run_pytorch_script, handling timeouts"""
     try:
         with timeout(timeout_seconds):
-            run_result = run_pytorch_script(
-                {
-                    "eval.py": script_content,
-                    "reference.py": reference_content,
-                    "submission.py": submission_content,
-                },
-                "eval.py",
-            )
-            return FullResult(success=True, error="", compile=None, run=run_result)
-    # TODO fixup error handling!
+            # TODO fixup error handling!
+            return run_config(config)
     except TimeoutException as e:
         return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None)
     except Exception as e:
+        exception = "".join(traceback.format_exception(e))
         return FullResult(
-            success=False, error=f"Error executing script: {str(e)}", compile=None, run=None
+            success=False, error=f"Error executing script:\n{exception}", compile=None, run=None
         )
-
-
-def modal_run_cuda_script(  # noqa: C901
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-    arch: int = None,
-) -> FullResult:
-    """Modal version of run_cuda_script, handling timeouts"""
-    try:
-        with timeout(timeout_seconds):
-            comp, run = run_cuda_script(
-                {"eval.cu": script_content},
-                {"reference.cuh": reference_content, "submission.cuh": submission_content},
-                arch=arch,
-                include_dirs=MODAL_CUDA_INCLUDE_DIRS,
-            )
-            return FullResult(success=True, error="", compile=comp, run=run)
-    # TODO fixup error handling!
-    except TimeoutException as e:
-        return FullResult(success=False, error=f"Timeout Error: {str(e)}", compile=None, run=None)
-    except Exception as e:
-        return FullResult(
-            success=False, error=f"Error executing script: {str(e)}", compile=None, run=None
-        )
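Only the tail of the timeout helper is visible in this hunk (restoring the previous SIGALRM handler). For context, a SIGALRM-based context manager consistent with the visible pieces (timeout_handler, TimeoutException, the signal.signal restore) would look roughly like this sketch:

# Hedged reconstruction of the timeout() context manager used above.
import signal
from contextlib import contextmanager

class TimeoutException(Exception):
    pass

@contextmanager
def timeout(seconds: int):
    def timeout_handler(signum, frame):
        # Fires when the alarm expires; unwinds out of the with-block.
        raise TimeoutException(f"Execution timed out after {seconds} seconds")

    original_handler = signal.signal(signal.SIGALRM, timeout_handler)
    try:
        signal.alarm(seconds)  # SIGALRM is Unix-only
        yield
    finally:
        signal.alarm(0)  # cancel any pending alarm
        signal.signal(signal.SIGALRM, original_handler)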
165 changes: 8 additions & 157 deletions src/discord-cluster-manager/modal_runner_archs.py
@@ -2,162 +2,13 @@
 # Modal apps on specific devices. We will fix this later.
 
 
-from consts import GPU_TO_SM
-from modal_runner import app, cuda_image, modal_run_cuda_script, modal_run_pytorch_script
-from run_eval import FullResult
+from modal_runner import app, cuda_image, modal_run_config, python_image
 
 
-# T4: sm_70 (CUDA 7.x, Maxwell Architecture)
-@app.function(
-    gpu="T4",
-    image=cuda_image,
-)
-def run_cuda_script_t4(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_cuda_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["T4"],
-    )
-
-
-@app.function(
-    gpu="T4",
-    image=cuda_image,
-)
-def run_pytorch_script_t4(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_pytorch_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["T4"],
-    )
-
-
-# L4: sm_80 (L4 Tensor Core architecture)
-@app.function(
-    gpu="L4",
-    image=cuda_image,
-)
-def run_cuda_script_l4(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_cuda_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["L4"],
-    )
-
-
-@app.function(
-    gpu="L4",
-    image=cuda_image,
-)
-def run_pytorch_script_l4(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_pytorch_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["L4"],
-    )
-
-
-# A100: sm_80 (Ampere architecture)
-@app.function(
-    gpu="A100",
-    image=cuda_image,
-)
-def run_cuda_script_a100(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_cuda_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["A100"],
-    )
-
-
-@app.function(
-    gpu="A100",
-    image=cuda_image,
-)
-def run_pytorch_script_a100(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_pytorch_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["A100"],
-    )
-
-
-# H100: sm_90 (Hopper architecture)
-@app.function(
-    gpu="H100",
-    image=cuda_image,
-)
-def run_cuda_script_h100(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_cuda_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["H100"],
-    )
-
-
-@app.function(
-    gpu="H100",
-    image=cuda_image,
-)
-def run_pytorch_script_h100(
-    script_content: str,
-    reference_content: str = None,
-    submission_content: str = None,
-    timeout_seconds: int = 600,
-) -> FullResult:
-    return modal_run_pytorch_script(
-        script_content,
-        reference_content,
-        submission_content,
-        timeout_seconds,
-        arch=GPU_TO_SM["H100"],
-    )
+gpus = ["T4", "L4", "A100", "H100"]
+for gpu in gpus:
+    app.function(gpu=gpu, image=cuda_image, name=f"run_cuda_script_{gpu.lower()}", serialized=True)(
+        modal_run_config
+    )
+    app.function(
+        gpu=gpu, image=python_image, name=f"run_pytorch_script_{gpu.lower()}", serialized=True
+    )(modal_run_config)
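The loop registers the same modal_run_config under eight names, so existing callers keep working unchanged. Invocation from the bot side then looks like the call site in modal_cog.py above ("discord-bot-runner" is the Modal app name used there; config comes from build_task_config):

# Calling one of the dynamically registered functions.
import modal

func = modal.Function.lookup("discord-bot-runner", "run_cuda_script_t4")
result = func.remote(config=config)

Presumably serialized=True is what allows one function object to be registered programmatically under several names instead of being imported by reference.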
(Diffs for the remaining 2 changed files are not shown here; the hunks above reference run_eval.py and utils.py, which are likely among them.)
