Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Pytorch Profiler #76

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
83 changes: 83 additions & 0 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -241,3 +241,86 @@ async def download_artifact(self, run_id):
return "No training artifacts found"
except Exception as e:
return f"Error downloading artifacts: {str(e)}"

@app_commands.command(name="profile", description="Profile using pytorch profiler on GitHub runners")
@app_commands.describe(
    script="The PyTorch model to profile",
    gpu_type="Choose the GPU type for GitHub Actions",
)
@app_commands.choices(
    gpu_type=[
        app_commands.Choice(name="NVIDIA", value="nvidia"),
        app_commands.Choice(name="AMD", value="amd"),
    ]
)
async def profile_github(
    self,
    interaction: discord.Interaction,
    script: discord.Attachment,
    gpu_type: app_commands.Choice[str],
) -> discord.Thread:
    """Profile an uploaded PyTorch script on a GitHub Actions GPU runner.

    Wraps the user's script in a torch.profiler harness, triggers the
    GitHub Actions workflow for the selected GPU, then posts status and
    profiler output into a dedicated Discord thread.

    Args:
        interaction: The slash-command interaction that invoked this command.
        script: User-uploaded .py attachment containing PyTorch code.
        gpu_type: Selected runner GPU ("nvidia" or "amd").

    Returns:
        The created Discord thread, or None when the attachment is rejected.
    """
    if not script.filename.endswith(".py"):
        await send_discord_message(
            interaction,
            "Please provide a Python (.py) file with PyTorch code to profile using the PyTorch profiler.",
            ephemeral=True,
        )
        return None

    thread = await self.bot.create_thread(interaction, f"{gpu_type.name}-profile", "GitHub Profiling")
    message = f"Created thread {thread.mention} for your GitHub profiling job"

    await send_discord_message(interaction, message)
    await thread.send(f"**Profiling `{script.filename}` with {gpu_type.name}...**")

    try:
        script_content = (await script.read()).decode("utf-8")
        selected_gpu = GPUType.AMD if gpu_type.value == "amd" else GPUType.NVIDIA

        profiler_script = self._wrap_in_profiler(script_content)

        run_id = await self.trigger_github_action(
            profiler_script,
            script.filename,
            selected_gpu,
        )

        if run_id:
            await thread.send(f"GitHub Action triggered! Run ID: {run_id}\nMonitoring progress...")
            status, logs, url = await self.check_workflow_status(run_id, thread)

            await thread.send(f"Profiling completed with status: {status}")

            # Discord messages cap at 2000 chars; leave headroom for the fences.
            if len(logs) > 1900:
                await self.bot.send_chunked_message(thread, logs, code_block=True)
            else:
                await thread.send(f"```\nProfiler output:\n{logs}\n```")

            if url:
                await thread.send(f"View the full run at: <{url}>")
        else:
            await thread.send("Failed to trigger GitHub Action. Please check the configuration.")

        return thread

    except Exception as e:
        logger.error(f"Error processing profiling request: {str(e)}", exc_info=True)
        if thread:
            await thread.send(f"Error processing profiling request: {str(e)}")
        raise

@staticmethod
def _wrap_in_profiler(script_content: str) -> str:
    """Embed user code inside a torch.profiler harness and return the source.

    Uses textwrap.indent so every line of the user's code gets the same
    indent. The previous manual ``' ' + line`` join added its prefix to the
    first line *in addition to* the template's own indentation at the ``{}``
    slot, double-indenting that line and breaking the generated module.
    """
    import textwrap

    template = textwrap.dedent(
        """\
        import torch
        from torch.profiler import profile, record_function, ProfilerActivity

        with profile(
            activities=[
                ProfilerActivity.CPU,
                ProfilerActivity.CUDA,
            ],
            record_shapes=True,
            profile_memory=True,
            with_stack=True,
        ) as prof:
            with record_function("model_inference"):
        {body}
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
        """
    )
    # An empty upload would leave the `with` suite empty -> SyntaxError.
    body = script_content if script_content.strip() else "pass"
    return template.format(body=textwrap.indent(body, " " * 8))
68 changes: 68 additions & 0 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
from discord.ext import commands
from utils import send_discord_message, setup_logging

# profiler imports
from modal_runner import modal_app
from modal_runner import run_profile_pytorch_script

logger = setup_logging()

class ModalCog(commands.Cog):
Expand Down Expand Up @@ -98,3 +102,67 @@ async def trigger_modal_run(self, script_content: str, filename: str) -> tuple[s
except Exception as e:
logger.error(f"Error in trigger_modal_run: {str(e)}", exc_info=True)
return f"Error: {str(e)}", 0

@app_commands.command(name="profile", description="Profile using pytorch profiler")
@app_commands.describe(
    script="The PyTorch model to profile",
    gpu_type="Choose the GPU type for Modal",
)
@app_commands.choices(
    gpu_type=[
        app_commands.Choice(name="NVIDIA T4", value="t4"),
    ]
)
async def profile_modal(
    self,
    interaction: discord.Interaction,
    script: discord.Attachment,
    gpu_type: app_commands.Choice[str],
) -> discord.Thread:
    """Profile an uploaded PyTorch script on a Modal GPU.

    Validates the attachment, creates a Discord thread for progress
    updates, runs the script under torch.profiler via Modal, and posts
    the profiler table back to the thread.

    Args:
        interaction: The slash-command interaction that invoked this command.
        script: User-uploaded .py attachment containing PyTorch code.
        gpu_type: Selected Modal GPU type (currently only "t4").

    Returns:
        The created Discord thread, or None when the attachment is rejected.
    """
    thread = None
    # Must be bound before the try body can fail: the except handler edits
    # this message, and an exception raised before it was created previously
    # produced a NameError that masked the real failure (and was guarded on
    # the wrong variable, `thread`).
    status_msg = None
    try:
        if not script.filename.endswith(".py"):
            await send_discord_message(
                interaction,
                "Please provide a Python (.py) file with PyTorch code to profile using the PyTorch profiler.",
                ephemeral=True,
            )
            return None

        thread = await self.bot.create_thread(interaction, f"{gpu_type.name}-{script.filename}-profile", "Modal Profiling")
        message = f"Created thread {thread.mention} for your Modal profiling job"

        await send_discord_message(interaction, message)
        await thread.send(f"**Profiling `{script.filename}` with {gpu_type.name}...**")

        script_content = (await script.read()).decode("utf-8")
        status_msg = await thread.send("**Running profiler on Modal...**\n> ⏳ Waiting for available GPU...")

        result, execution_time_ms = await self.trigger_modal_profile(script_content)

        await status_msg.edit(content="**Running profiler on Modal...**\n> ✅ Profiling completed!")
        # Surface the measured execution time, which was previously discarded.
        await thread.send(f"**Profiling results** ({execution_time_ms:.2f} ms):\n```\n{result}\n```")

        return thread

    except Exception as e:
        logger.error(f"Error processing profiling request: {str(e)}", exc_info=True)
        if status_msg:
            await status_msg.edit(content="**Running profiler on Modal...**\n> ❌ Profiling failed!")
        if thread:
            await thread.send(f"**Error:** {str(e)}")
        raise

async def trigger_modal_profile(self, script_content: str) -> tuple[str, float]:
    """Run the PyTorch profiler on Modal and return its output.

    Args:
        script_content: Source of the user's PyTorch script.

    Returns:
        tuple[str, float]: (profiler output, execution time in ms), or
        (error message, 0) when the run fails — mirroring the error
        contract of trigger_modal_run above.
    """
    logger.info("Attempting to trigger Modal profiling run")

    try:
        # Use the module logger rather than a bare print() so the message
        # goes through the configured logging setup like everything else.
        logger.info("Running profiler with Modal")
        with modal.enable_output():
            with modal_app.run():
                result, execution_time_ms = run_profile_pytorch_script.remote(script_content)

        return result, execution_time_ms

    except Exception as e:
        logger.error(f"Error in trigger_modal_profile: {str(e)}", exc_info=True)
        return f"Error: {str(e)}", 0
40 changes: 40 additions & 0 deletions src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,46 @@ async def verify_modal_run(
)
return False

async def verify_github_profile(
    self, github_cog: GitHubCog, gpu_type: app_commands.Choice[str], interaction: discord.Interaction
) -> bool:
    """Run an end-to-end check of the GitHub profiling command.

    Invokes profile_github, then scans the resulting thread's message
    history for the status messages a successful profiling run posts.

    Args:
        github_cog: The loaded GitHubCog whose profile command is exercised.
        gpu_type: GPU choice forwarded to profile_github.
        interaction: Interaction used both to run the command and to report
            the verification verdict.

    Returns:
        True when every expected message pattern was found, else False.
    """
    # NOTE(review): `script_file` is not defined in this scope — it is neither
    # a parameter nor a local, so as written this line raises NameError.
    # Presumably a sample .py attachment should be constructed or passed in
    # (compare the sibling verify_* helpers); confirm and fix.
    github_thread = await github_cog.profile_github(
        interaction, script_file, gpu_type
    )

    # Collect the full message history of the thread the command created.
    message_contents = [msg.content async for msg in github_thread.history(limit=None)]

    # Each regex must match at least one thread message for a healthy run.
    required_patterns = [
        "Profiling `.*` with",
        "GitHub Action triggered!",
        "Profiling completed with status:",
        "Profiler output:",
    ]

    all_patterns_found = all(
        any(re.search(pattern, content, re.DOTALL) is not None for content in message_contents)
        for pattern in required_patterns
    )

    if all_patterns_found:
        await send_discord_message(
            interaction,
            f"✅ GitHub {gpu_type.name} profiling completed successfully - all expected messages found!",
        )
        return True
    else:
        # Report exactly which expected messages were missing.
        missing_patterns = [
            pattern
            for pattern in required_patterns
            if not any(re.search(pattern, content, re.DOTALL) for content in message_contents)
        ]
        await send_discord_message(
            interaction,
            f"❌ GitHub {gpu_type.name} profiling verification failed. Missing expected messages:\n"
            + "\n".join(f"- {pattern}" for pattern in missing_patterns),
        )
        return False

@app_commands.command(name="verifyruns")
async def verify_runs(self, interaction: discord.Interaction):
"""Verify runs on on Modal, GitHub Nvidia, and GitHub AMD."""
Expand Down
55 changes: 55 additions & 0 deletions src/discord-cluster-manager/modal_runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,3 +146,58 @@ def run_cuda_script(script_content: str, timeout_seconds: int = 600) -> tuple[st
if os.path.exists("script.out"):
os.remove("script.out")
sys.stdout = sys.__stdout__


def run_profile_pytorch_script(script_content: str, timeout_seconds: int = 300) -> tuple[str, float]:
    """
    Profiles the provided PyTorch script using torch.profiler

    Args:
        script_content: The PyTorch script to profile
        timeout_seconds: Maximum execution time before timeout

    Returns:
        tuple[str, float]: (Profiler output, execution time in milliseconds);
        (error message, 0.0) on timeout or failure.
    """
    # NOTE(review): the cog invokes this via `.remote(...)`, which requires it
    # to be registered as a Modal function (e.g. `@modal_app.function(gpu=...)`)
    # like the other runners in this module — confirm the decorator is added.
    import sys
    import textwrap
    import time
    import torch  # noqa: F401 — warm the heavy import outside the timed exec below
    from io import StringIO

    # Capture everything the profiled script (and the profiler table) prints.
    output = StringIO()
    sys.stdout = output

    try:
        with timeout(timeout_seconds):
            # Template with the user-code slot at column 0; textwrap.indent
            # gives every user line the same 8-space indent. The previous
            # manual ``' ' + line`` join also prefixed the first line, which
            # already carried the template's own indentation at the ``{}``
            # slot — double-indenting it and breaking the generated module.
            template = textwrap.dedent(
                """\
                import torch
                from torch.profiler import profile, record_function, ProfilerActivity

                with profile(
                    activities=[
                        ProfilerActivity.CPU,
                        ProfilerActivity.CUDA,
                    ],
                    record_shapes=True,
                    profile_memory=True,
                    with_stack=True,
                ) as prof:
                    with record_function("model_inference"):
                {body}
                print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
                """
            )
            # An empty upload would leave the `with` suite empty -> SyntaxError.
            body = script_content if script_content.strip() else "pass"
            profiler_script = template.format(body=textwrap.indent(body, " " * 8))

            # SECURITY: exec of user-supplied code — acceptable only because it
            # runs inside the isolated Modal sandbox, never in the bot process.
            # A single namespace dict is used for globals so functions/classes
            # defined by the user script can see the script's own top-level
            # names (split globals/locals break that).
            namespace = {"__name__": "__main__"}
            execution_start_time = time.perf_counter()
            exec(profiler_script, namespace)
            execution_time_ms = (time.perf_counter() - execution_start_time) * 1000

            return output.getvalue(), execution_time_ms

    except TimeoutException as e:
        return f"Timeout Error: {str(e)}", 0.0
    except Exception as e:
        return f"Error executing script: {str(e)}", 0.0
    finally:
        # Always restore stdout, even on timeout or error.
        sys.stdout = sys.__stdout__
Loading