From 3205e2c64fa133d7ad93b6d87bf33781fb68058d Mon Sep 17 00:00:00 2001
From: Andre
Date: Thu, 26 Dec 2024 00:29:33 -0700
Subject: [PATCH] pytorch profiler initial

---
 .../cogs/github_cog.py                        | 114 +++++++++++++++++++
 src/discord-cluster-manager/cogs/modal_cog.py |  98 ++++++++++++++++
 .../cogs/verify_run_cog.py                    |  60 ++++++++++
 src/discord-cluster-manager/modal_runner.py   |  73 ++++++++++++
 4 files changed, 345 insertions(+)

diff --git a/src/discord-cluster-manager/cogs/github_cog.py b/src/discord-cluster-manager/cogs/github_cog.py
index c194c10..85468c7 100644
--- a/src/discord-cluster-manager/cogs/github_cog.py
+++ b/src/discord-cluster-manager/cogs/github_cog.py
@@ -241,3 +241,117 @@ async def download_artifact(self, run_id):
             return "No training artifacts found"
         except Exception as e:
             return f"Error downloading artifacts: {str(e)}"
+
+    @app_commands.command(name="profile", description="Profile using pytorch profiler on GitHub runners")
+    @app_commands.describe(
+        script="The PyTorch model to profile",
+        gpu_type="Choose the GPU type for GitHub Actions",
+    )
+    @app_commands.choices(
+        gpu_type=[
+            app_commands.Choice(name="NVIDIA", value="nvidia"),
+            app_commands.Choice(name="AMD", value="amd"),
+        ]
+    )
+    async def profile_github(
+        self,
+        interaction: discord.Interaction,
+        script: discord.Attachment,
+        gpu_type: app_commands.Choice[str],
+    ) -> discord.Thread:
+        """Profile an uploaded PyTorch script on a GitHub Actions runner.
+
+        The script is wrapped in a torch.profiler harness, dispatched through
+        trigger_github_action, and the workflow status and logs are posted to a
+        newly created thread.
+
+        Args:
+            interaction: Interaction that invoked the command.
+            script: Attachment to profile; must be a ".py" file.
+            gpu_type: Selected runner GPU ("nvidia" or "amd").
+
+        Returns:
+            The thread holding the results, or None when the attachment is
+            rejected before a thread is created.
+
+        Raises:
+            Exception: Any failure after thread creation is logged, reported in
+                the thread, and re-raised.
+        """
+        if not script.filename.endswith(".py"):
+            await send_discord_message(
+                interaction,
+                "Please provide a Python (.py) file with PyTorch code to profile using the PyTorch profiler.",
+                ephemeral=True
+            )
+            return None
+
+        thread = await self.bot.create_thread(interaction, f"{gpu_type.name}-profile", "GitHub Profiling")
+        message = f"Created thread {thread.mention} for your GitHub profiling job"
+
+        await send_discord_message(interaction, message)
+        await thread.send(f"**Profiling `{script.filename}` with {gpu_type.name}...**")
+
+        try:
+            script_content = (await script.read()).decode("utf-8")
+            selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA
+
+            # Indent the upload by eight spaces so it nests inside both `with`
+            # blocks of the harness below.
+            indented_script = "\n".join(
+                "        " + line for line in script_content.splitlines()
+            )
+
+            # The trailing `pass` keeps the harness valid Python when the upload is
+            # empty or contains only comments.
+            profiler_script = """
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+
+with profile(
+    activities=[
+        ProfilerActivity.CPU,
+        ProfilerActivity.CUDA,
+    ],
+    record_shapes=True,
+    profile_memory=True,
+    with_stack=True
+) as prof:
+    with record_function("model_inference"):
+{}
+        pass
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+""".format(indented_script)
+
+            run_id = await self.trigger_github_action(
+                profiler_script,
+                script.filename,
+                selected_gpu
+            )
+
+            if run_id:
+                await thread.send(f"GitHub Action triggered! Run ID: {run_id}\nMonitoring progress...")
+                status, logs, url = await self.check_workflow_status(run_id, thread)
+
+                await thread.send(f"Profiling completed with status: {status}")
+
+                if len(logs) > 1900:
+                    # Chunked output has no header of its own; send it separately so
+                    # long logs carry the same "Profiler output:" marker as short ones.
+                    await thread.send("Profiler output:")
+                    await self.bot.send_chunked_message(thread, logs, code_block=True)
+                else:
+                    await thread.send(f"```\nProfiler output:\n{logs}\n```")
+
+                if url:
+                    await thread.send(f"View the full run at: <{url}>")
+            else:
+                await thread.send("Failed to trigger GitHub Action. Please check the configuration.")
+
+            return thread
+
+        except Exception as e:
+            logger.error(f"Error processing profiling request: {str(e)}", exc_info=True)
+            if thread:
+                await thread.send(f"Error processing profiling request: {str(e)}")
+            raise
diff --git a/src/discord-cluster-manager/cogs/modal_cog.py b/src/discord-cluster-manager/cogs/modal_cog.py
index 2fc2d24..9c19835 100644
--- a/src/discord-cluster-manager/cogs/modal_cog.py
+++ b/src/discord-cluster-manager/cogs/modal_cog.py
@@ -6,6 +6,8 @@
 from discord.ext import commands
 from utils import send_discord_message, setup_logging
 
+from modal_runner import modal_app, run_profile_pytorch_script
+
 logger = setup_logging()
 
 class ModalCog(commands.Cog):
@@ -98,3 +100,99 @@ async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[s
         except Exception as e:
             logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
             return f"Error: {str(e)}", 0
+
+    @app_commands.command(name="profile", description="Profile using pytorch profiler")
+    @app_commands.describe(
+        script="The PyTorch model to profile",
+        gpu_type="Choose the GPU type for Modal"
+    )
+    @app_commands.choices(
+        gpu_type=[
+            app_commands.Choice(name="NVIDIA T4", value="t4"),
+        ]
+    )
+    async def profile_modal(
+        self,
+        interaction: discord.Interaction,
+        script: discord.Attachment,
+        gpu_type: app_commands.Choice[str],
+    ) -> discord.Thread:
+        """Profile an uploaded PyTorch script with torch.profiler on Modal.
+
+        Args:
+            interaction: Interaction that invoked the command.
+            script: Attachment to profile; must be a ".py" file.
+            gpu_type: Selected Modal GPU (currently only "t4").
+
+        Returns:
+            The thread holding the results, or None when the attachment is
+            rejected before a thread is created.
+
+        Raises:
+            Exception: Any failure is logged, reported in the thread when one
+                exists, and re-raised.
+        """
+        thread = None
+        # Stays None until the progress message has been sent, so the error
+        # handler below never touches an unbound name.
+        status_msg = None
+        try:
+            if not script.filename.endswith(".py"):
+                await send_discord_message(
+                    interaction,
+                    "Please provide a Python (.py) file with PyTorch code to profile using the PyTorch profiler.",
+                    ephemeral=True
+                )
+                return None
+
+            thread = await self.bot.create_thread(interaction, f"{gpu_type.name}-{script.filename}-profile", "Modal Profiling")
+            message = f"Created thread {thread.mention} for your Modal profiling job"
+
+            await send_discord_message(interaction, message)
+            await thread.send(f"**Profiling `{script.filename}` with {gpu_type.name}...**")
+
+            script_content = (await script.read()).decode("utf-8")
+            status_msg = await thread.send("**Running profiler on Modal...**\n> ⏳ Waiting for available GPU...")
+
+            result, _execution_time_ms = await self.trigger_modal_profile(script_content)
+
+            await status_msg.edit(content="**Running profiler on Modal...**\n> ✅ Profiling completed!")
+            if len(result) > 1900:
+                # Same 1900-character threshold as the GitHub profiler: long tables
+                # go through the bot's chunking helper instead of a single message.
+                await thread.send("**Profiling results:**")
+                await self.bot.send_chunked_message(thread, result, code_block=True)
+            else:
+                await thread.send(f"**Profiling results:**\n```\n{result}\n```")
+
+            return thread
+
+        except Exception as e:
+            logger.error(f"Error processing profiling request: {str(e)}", exc_info=True)
+            if status_msg:
+                await status_msg.edit(content="**Running profiler on Modal...**\n> ❌ Profiling failed!")
+            if thread:
+                await thread.send(f"**Error:** {str(e)}")
+            raise
+
+    async def trigger_modal_profile(self, script_content: str) -> tuple[str, float]:
+        """Run the profiler on Modal and return its output.
+
+        Args:
+            script_content: Source of the PyTorch script to profile.
+
+        Returns:
+            A (output, execution time in ms) tuple. Failures are logged and
+            returned as ("Error: ...", 0.0) instead of being raised.
+        """
+        logger.info("Attempting to trigger Modal profiling run")
+
+        try:
+            with modal.enable_output(), modal_app.run():
+                result, execution_time_ms = run_profile_pytorch_script.remote(script_content)
+
+            return result, execution_time_ms
+
+        except Exception as e:
+            logger.error(f"Error in trigger_modal_profile: {str(e)}", exc_info=True)
+            return f"Error: {str(e)}", 0.0
diff --git a/src/discord-cluster-manager/cogs/verify_run_cog.py b/src/discord-cluster-manager/cogs/verify_run_cog.py
index b627128..84d310f 100644
--- a/src/discord-cluster-manager/cogs/verify_run_cog.py
+++ b/src/discord-cluster-manager/cogs/verify_run_cog.py
@@ -144,6 +144,66 @@ async def verify_modal_run(
             )
             return False
 
+    async def verify_github_profile(
+        self, github_cog: GitHubCog, gpu_type: app_commands.Choice[str], interaction: discord.Interaction
+    ) -> bool:
+        """Run the GitHub profile command and check its thread for the expected messages.
+
+        Args:
+            github_cog: Cog that owns the profile command.
+            gpu_type: GPU choice forwarded to the command.
+            interaction: Interaction used to run the command and report the result.
+
+        Returns:
+            True when every expected message pattern appears in the thread,
+            otherwise False.
+        """
+        # profile_github is wrapped by @app_commands.command, so the coroutine is
+        # reached through .callback with the cog passed explicitly.
+        # NOTE(review): script_file is not defined in this hunk; presumably it is
+        # the module-level attachment the other verify_* helpers use - confirm.
+        github_thread = await github_cog.profile_github.callback(
+            github_cog, interaction, script_file, gpu_type
+        )
+
+        if github_thread is None:
+            # The command returns None when it rejects the attachment.
+            await send_discord_message(
+                interaction,
+                f"❌ GitHub {gpu_type.name} profiling verification failed. No thread was created.",
+            )
+            return False
+
+        message_contents = [msg.content async for msg in github_thread.history(limit=None)]
+
+        required_patterns = [
+            "Profiling `.*` with",
+            "GitHub Action triggered!",
+            "Profiling completed with status:",
+            "Profiler output:",
+        ]
+
+        # Computed once; an empty list means every pattern was found.
+        missing_patterns = [
+            pattern
+            for pattern in required_patterns
+            if not any(re.search(pattern, content, re.DOTALL) for content in message_contents)
+        ]
+
+        if not missing_patterns:
+            await send_discord_message(
+                interaction,
+                f"✅ GitHub {gpu_type.name} profiling completed successfully - all expected messages found!",
+            )
+            return True
+
+        await send_discord_message(
+            interaction,
+            f"❌ GitHub {gpu_type.name} profiling verification failed. Missing expected messages:\n"
+            + "\n".join(f"- {pattern}" for pattern in missing_patterns),
+        )
+        return False
+
     @app_commands.command(name="verifyruns")
     async def verify_runs(self, interaction: discord.Interaction):
         """Verify runs on on Modal, GitHub Nvidia, and GitHub AMD."""
diff --git a/src/discord-cluster-manager/modal_runner.py b/src/discord-cluster-manager/modal_runner.py
index 16680bb..3d946a3 100644
--- a/src/discord-cluster-manager/modal_runner.py
+++ b/src/discord-cluster-manager/modal_runner.py
@@ -146,3 +146,76 @@ def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> tuple[st
         if os.path.exists("script.out"):
             os.remove("script.out")
         sys.stdout = sys.__stdout__
+
+
+# Imported here because this change only touches the end of the module; fold it
+# into the imports at the top of the file when applying.
+import modal  # noqa: E402
+
+
+# Registered with the Modal app so callers can invoke it through .remote(); the
+# image needs torch because the profiling harness imports it.
+@modal_app.function(gpu="T4", image=modal.Image.debian_slim().pip_install("torch"))
+def run_profile_pytorch_script(script_content: str, timeout_seconds: int = 300) -> tuple[str, float]:
+    """
+    Profiles the provided PyTorch script using torch.profiler
+
+    The script is exec'd inside a harness that wraps it in a profile() /
+    record_function("model_inference") block and prints the first ten rows of
+    the operator table sorted by total CUDA time. Everything printed is captured.
+
+    Args:
+        script_content: The PyTorch script to profile
+        timeout_seconds: Maximum execution time before timeout
+
+    Returns:
+        tuple[str, float]: (Profiler output, execution time in milliseconds).
+        On timeout or any other failure the first element is an error message
+        and the time is 0.0.
+    """
+    import time
+    from contextlib import redirect_stdout
+    from io import StringIO
+
+    output = StringIO()
+
+    # Indent the script by eight spaces so it nests inside both `with` blocks;
+    # the trailing `pass` keeps the harness valid for an empty script.
+    indented_script = "\n".join("        " + line for line in script_content.splitlines())
+    profiler_script = """
+import torch
+from torch.profiler import profile, record_function, ProfilerActivity
+
+with profile(
+    activities=[
+        ProfilerActivity.CPU,
+        ProfilerActivity.CUDA,
+    ],
+    record_shapes=True,
+    profile_memory=True,
+    with_stack=True
+) as prof:
+    with record_function("model_inference"):
+{}
+        pass
+print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
+""".format(indented_script)
+
+    try:
+        # redirect_stdout restores whatever stream was active before, even on error.
+        with redirect_stdout(output), timeout(timeout_seconds):
+            # A single namespace serves as globals and locals so functions and
+            # classes defined by the script can see its top-level imports.
+            namespace = {}
+
+            # NOTE: exec runs user-supplied code; only call this inside the Modal sandbox.
+            execution_start_time = time.perf_counter()
+            exec(profiler_script, namespace)
+            execution_time_ms = (time.perf_counter() - execution_start_time) * 1000
+
+        return output.getvalue(), execution_time_ms
+
+    except TimeoutException as e:
+        return f"Timeout Error: {str(e)}", 0.0
+    except Exception as e:
+        return f"Error executing script: {str(e)}", 0.0