Fix/concurrent-modal (#117)

* Fix: modal logging
* Fix: modal concurrent
* Fix: modal concurrent works
* Fix: modal async cleanup
* Refactor: remove unused
* Feat: ci/cd modal deploy
* Fix: typo in ci/cd
* Fix: proper name for modal deploy
* super minor: just replace followup with send_discord_message, it yielded a dumb bug

---------

Co-authored-by: Alex Zhang <[email protected]>
gpu-mode · Jan 11, 2025 · 47d9b21 · 47d9b21
1 parent edbe2fb
commit 47d9b21
Show file tree

Hide file tree

Showing 3 changed files with 104 additions and 143 deletions.
diff --git a/.github/workflows/modal-deploy.yml b/.github/workflows/modal-deploy.yml
@@ -0,0 +1,32 @@
+name: CI/CD
+
+on:
+  push:
+    branches:
+      - main
+
+jobs:
+  deploy:
+    name: Modal Deployment
+    runs-on: ubuntu-latest
+    env:
+      MODAL_TOKEN_ID: ${{ secrets.MODAL_TOKEN_ID }}
+      MODAL_TOKEN_SECRET: ${{ secrets.MODAL_TOKEN_SECRET }}
+
+    steps:
+      - name: Checkout Repository
+        uses: actions/checkout@v4
+
+      - name: Install Python
+        uses: actions/setup-python@v5
+        with:
+          python-version: "3.10"
+
+      - name: Install Modal
+        run: |
+          python -m pip install --upgrade pip
+          pip install modal
+
+      - name: Deploy job
+        run: |
+          modal deploy src/discord-cluster-manager/modal_runner_archs.py
diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py
@@ -1,4 +1,4 @@
-import time
+import asyncio
 from typing import Optional
 
 import discord
@@ -7,7 +7,6 @@
 from discord import app_commands
 from discord.ext import commands
 from leaderboard_eval import cu_eval, py_eval
-from modal_runner_archs import modal_context
 from utils import send_discord_message, send_logs, setup_logging
 
 logger = setup_logging()
@@ -32,113 +31,116 @@ async def run_modal(
         interaction: discord.Interaction,
         script: discord.Attachment,
         gpu_type: app_commands.Choice[str],
-        reference_script: discord.Attachment = None,
+        reference_script: Optional[discord.Attachment] = None,
         reference_code: str = None,
     ) -> discord.Thread:
         thread = None
+        status_msg = None
         try:
             if not script.filename.endswith((".py", ".cu", ".cuh", ".cpp")):
                 await send_discord_message(
-                    "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file"
+                    interaction,
+                    "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file",
+                    ephemeral=True,
                 )
                 return None
 
-            thread = await self.bot.create_thread(interaction, gpu_type.name, "Modal Job")
-            queue_start_time = time.perf_counter()
-
-            await thread.send(f"**Processing `{script.filename}` with {gpu_type.name}...**")
+            # TODO: Maybe find a better way?
+            if not interaction.response.is_done():
+                await interaction.response.defer(ephemeral=True)
+            channel = interaction.channel
+            message = await channel.send(f"Starting Modal job with {gpu_type.name}...")
+            thread = await message.create_thread(name=f"{gpu_type.name} Modal Job")
 
             script_content = (await script.read()).decode("utf-8")
             status_msg = await thread.send(
                 "**Running on Modal...**\n> ⏳ Waiting for available GPU..."
             )
 
-            script_content = (await script.read()).decode("utf-8")
             filename = "train.py" if script.filename.endswith(".py") else "train.cu"
-
+            reference_content = None
             if reference_script is not None or reference_code is not None:
                 reference_content = (
                     reference_code
                     if reference_code is not None
                     else (await reference_script.read()).decode("utf-8")
                 )
-                result, score = await self.trigger_modal_run(
-                    script_content,
-                    filename,
-                    gpu_type.value,
-                    reference_content,
-                )
-            else:
-                result, score = await self.trigger_modal_run(
-                    script_content, filename, gpu_type.value
-                )
-                queue_end_time = time.perf_counter()
-                queue_time = queue_end_time - queue_start_time
-
-                # Send metrics and results
-                await thread.send(f"\n**Script size:** {len(script_content)} bytes")
-                await thread.send(f"**Queue time:** {queue_time:.3f} s")
-                await thread.send(f"**Execution time:** {score:.3f} s\n")
-                await thread.send(f"**Modal execution result:**\n```\n{result}\n```")
-
-            if "check_implementation failed" in result:
-                await thread.send("Modal run failed.\n")
-                await thread.send("check_implementation failed.\n")
-                await send_logs(thread, result)
-                await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
-                return thread
-            elif "Error" in result:
-                await thread.send("Modal run failed.\n")
-                await send_logs(thread, result)
-                await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
-                return thread
 
-            if result is not None:
-                await thread.send(f"**score:{score:.9f}**\n```")
+            result, score = await self.handle_modal_execution(
+                interaction,
+                thread,
+                script_content,
+                filename,
+                gpu_type.value,
+                reference_content,
+                status_msg,
+            )
 
-            # Update status message to show completion
-            await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
+            if result is not None and score > 0:
+                await thread.send(f"**score:{score:.9f}**")
 
             return thread
 
         except Exception as e:
             logger.error(f"Error processing request: {str(e)}", exc_info=True)
-            if thread:
-                # Update status message to show error
+            if thread and status_msg:
                 await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
                 await thread.send(f"**Error:** {str(e)}")
             raise
 
-    async def trigger_modal_run(
+    async def handle_modal_execution(
         self,
+        interaction: discord.Interaction,
+        thread: discord.Thread,
         script_content: str,
         filename: str,
         gpu_type: str,
-        reference_content: Optional[str] = None,
+        reference_content: Optional[str],
+        status_msg: discord.Message,
     ) -> tuple[str, float]:
-        logger.info("Attempting to trigger Modal run")
+        try:
+            loop = asyncio.get_event_loop()
+            func_type = "pytorch" if filename.endswith(".py") else "cuda"
+            func_name = f"run_{func_type}_script_{gpu_type.lower()}"
+
+            if reference_content is not None:
+                result, score = await loop.run_in_executor(
+                    None,
+                    lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
+                        py_eval if filename.endswith(".py") else cu_eval,
+                        reference_content=reference_content,
+                        submission_content=script_content,
+                    ),
+                )
+            else:
+                result, score = await loop.run_in_executor(
+                    None,
+                    lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
+                        script_content,
+                    ),
+                )
+                await send_discord_message(
+                    interaction, f"Modal job completed in thread {thread.jump_url}", ephemeral=True
+                )
 
-        from modal_runner import app
+            # Send results
+            await thread.send(f"\n**Script size:** {len(script_content)} bytes")
+            await thread.send(f"**Execution time:** {score:.3f} s\n")
 
-        try:
-            print(f"Running {filename} with Modal")
-            file_type = filename.split(".")[-1]
-            with modal.enable_output():
-                with app.run(), modal_context() as runners:
-                    if reference_content is not None:
-                        eval_code = py_eval if file_type == "py" else cu_eval
-                        runner = runners.get_runner(file_type, gpu_type)
-                        stdout, score = runner.remote(
-                            eval_code,
-                            reference_content=reference_content,
-                            submission_content=script_content,
-                        )
-                    else:
-                        runner = runners.get_runner(file_type, gpu_type)
-                        stdout, score = runner.remote(script_content)
-
-            return stdout, score
+            if "check_implementation failed" in result or "Error" in result:
+                await thread.send("Modal run failed.\n")
+                await send_logs(thread, result)
+                await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
+                return result, 0
+
+            if result is not None:
+                await thread.send(f"**score:{score:.9f}**\n```")
+
+            await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
+            return result, score
 
         except Exception as e:
-            logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
-            return f"Error: {str(e)}", 0
+            logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)
+            await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
+            await thread.send(f"**Error:** {str(e)}")
+            raise
diff --git a/src/discord-cluster-manager/modal_runner_archs.py b/src/discord-cluster-manager/modal_runner_archs.py
@@ -1,8 +1,6 @@
 # This file contains wrapper functions for running
 # Modal apps on specific devices. We will fix this later.
 
-import sys
-from contextlib import contextmanager
 
 from consts import GPU_TO_SM
 from modal_runner import app, cuda_image, run_cuda_script, run_pytorch_script
@@ -162,74 +160,3 @@ def run_pytorch_script_h100(
         timeout_seconds,
         arch=GPU_TO_SM["H100"],
     )
-
-
-def _get_runner_module_functions(prefix: str):
-    current_module = sys.modules[__name__]
-    return {
-        name.split("_")[-1]: getattr(current_module, name)
-        for name in dir(current_module)
-        if name.startswith(f"run_{prefix}_script_")
-    }
-
-
-pytorch_function_map = _get_runner_module_functions("pytorch")
-cuda_function_map = _get_runner_module_functions("cuda")
-
-
-@contextmanager
-def modal_context():
-    """
-    Context manager that ensures Modal functions are hydrated while in use.
-    Usage:
-        with hydrated_modal_runners() as runners:
-            function = runners.get_runner("py", "t4")
-            stdout, score = function(*args, **kwargs)
-    """
-    current_module = sys.modules[__name__]
-
-    # Dynamically get all runner functions
-    pytorch_functions = {
-        name.split("_")[-1]: getattr(current_module, name)
-        for name in dir(current_module)
-        if name.startswith("run_pytorch_script_")
-    }
-
-    cuda_functions = {
-        name.split("_")[-1]: getattr(current_module, name)
-        for name in dir(current_module)
-        if name.startswith("run_cuda_script_")
-    }
-
-    class Runners:
-        def __init__(self):
-            self._pytorch_map = pytorch_functions
-            self._cuda_map = cuda_functions
-
-        def get_runner(self, runner_type: str, gpu_type: str):
-            if runner_type == "py":
-                function = self._pytorch_map.get(gpu_type.lower())
-            elif runner_type == "cu":
-                function = self._cuda_map.get(gpu_type.lower())
-            else:
-                raise ValueError(f"Invalid runner type: {runner_type}")
-            return function
-
-        def _get_cuda_runner(self, gpu_type: str):
-            function = self._cuda_map.get(gpu_type.lower())
-            if function:
-                return function
-            raise ValueError(f"Function for gpu_type {gpu_type} not found")
-
-        def _get_pytorch_runner(self, gpu_type: str):
-            function = self._pytorch_map.get(gpu_type.lower())
-            if function:
-                return function
-            raise ValueError(f"Function for gpu_type {gpu_type} not found")
-
-    runners = Runners()
-    try:
-        yield runners
-    finally:
-        # Clean up if needed
-        pass