Skip to content

Commit

Permalink
fix db support for new gh/modal runners and verifiers
Browse files Browse the repository at this point in the history
  • Loading branch information
alexzhang13 committed Jan 15, 2025
1 parent 990fdce commit 95ffba1
Show file tree
Hide file tree
Showing 5 changed files with 48 additions and 68 deletions.
6 changes: 3 additions & 3 deletions src/discord-cluster-manager/cogs/github_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,12 +44,12 @@ async def run_github(
gpu_type: app_commands.Choice[str],
reference_script: discord.Attachment = None,
reference_code: str = None,
) -> discord.Thread:
) -> tuple[discord.Thread, FullResult]:
if not script.filename.endswith((".py", ".cu", ".cuh", ".cpp")):
await send_discord_message(
interaction, "Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file"
)
return None
return None, None

thread = await self.bot.create_thread(interaction, gpu_type.name, "GitHub Job")
await thread.send(f"Processing `{script.filename}` with {gpu_type.name}...")
Expand Down Expand Up @@ -98,7 +98,7 @@ async def run_github(
"Failed to trigger GitHub Action. Please check the configuration."
)

return thread
return thread, result

except Exception as e:
logger.error(f"Error processing request: {str(e)}", exc_info=True)
Expand Down
72 changes: 28 additions & 44 deletions src/discord-cluster-manager/cogs/leaderboard_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@
from ui.misc import DeleteConfirmationModal, GPUSelectionView
from ui.table import create_table
from utils import (
extract_score,
get_user_from_id,
send_discord_message,
setup_logging,
Expand All @@ -44,7 +43,7 @@ async def async_submit_cog_job(
runner_name: str = "GitHub",
):
try:
discord_thread = await command.callback(
discord_thread, result = await command.callback(
cog,
interaction,
script,
Expand All @@ -58,55 +57,40 @@ async def async_submit_cog_job(
print(f"Webhook not found: {e}")
await send_discord_message(interaction, "❌ The webhook was not found.")

message_contents = [msg.content async for msg in discord_thread.history(limit=None)]

try:
# For CUDA leaderboards, make more robust
if "check_implementation failed" in message_contents:
await send_discord_message(
interaction,
"check_implementation failed. User kernel and reference kernel do not match.",
ephemeral=True,
)
return

# TODO: Make this more robust later
score = extract_score("".join(message_contents))
if result.success:
score = float(result.run.result["duration.mean"]) / 1e9

with self.bot.leaderboard_db as db:
db.create_submission(
{
"submission_name": script.filename,
"submission_time": datetime.now(),
"leaderboard_name": leaderboard_name,
"code": submission_content,
"user_id": interaction.user.id,
"submission_score": score,
"gpu_type": gpu.name,
}
)

with self.bot.leaderboard_db as db:
db.create_submission(
{
"submission_name": script.filename,
"submission_time": datetime.now(),
"leaderboard_name": leaderboard_name,
"code": submission_content,
"user_id": interaction.user.id,
"submission_score": score,
"gpu_type": gpu.name,
}
user_id = (
interaction.user.global_name
if interaction.user.nick is None
else interaction.user.nick
)

user_id = (
interaction.user.global_name
if interaction.user.nick is None
else interaction.user.nick
)

await send_discord_message(
interaction,
f"Successfully ran on {gpu.name} using {runner_name} runners!\n"
+ f"Leaderboard '{leaderboard_name}'.\n"
+ f"Submission title: {script.filename}.\n"
+ f"Submission user: {user_id}.\n"
+ f"Runtime: {score:.9f} seconds.",
ephemeral=True,
)
await discord_thread.send(
f"Successfully ran on {gpu.name} using {runner_name} runners!\n"
+ f"Leaderboard '{leaderboard_name}'.\n"
+ f"Submission title: {script.filename}.\n"
+ f"Submission user: {user_id}.\n"
+ f"Runtime: {score:.9f} seconds.",
)
except Exception:
await send_discord_message(
interaction,
await discord_thread.send(
f"Leaderboard submission to '{leaderboard_name}' on {gpu.name} "
+ f"using {runner_name} runners failed!\n",
ephemeral=True,
)

async def select_gpu_view(
Expand Down
31 changes: 13 additions & 18 deletions src/discord-cluster-manager/cogs/modal_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@
from discord.ext import commands
from leaderboard_eval import cu_eval, py_eval
from report import generate_report
from utils import send_discord_message, send_logs, setup_logging
from run_eval import FullResult
from utils import send_discord_message, setup_logging

logger = setup_logging()

Expand All @@ -34,7 +35,7 @@ async def run_modal(
gpu_type: app_commands.Choice[str],
reference_script: Optional[discord.Attachment] = None,
reference_code: str = None,
) -> discord.Thread:
) -> tuple[discord.Thread, FullResult]:
thread = None
status_msg = None
try:
Expand All @@ -44,11 +45,11 @@ async def run_modal(
"Please provide a Python (.py) or CUDA (.cu / .cuh / .cpp) file",
ephemeral=True,
)
return None
return None, None

# TODO: Maybe find a better way?
if not interaction.response.is_done():
await interaction.response.defer(ephemeral=True)

channel = interaction.channel
message = await channel.send(f"Starting Modal job with {gpu_type.name}...")
thread = await message.create_thread(name=f"{gpu_type.name} Modal Job")
Expand All @@ -67,7 +68,7 @@ async def run_modal(
else (await reference_script.read()).decode("utf-8")
)

await self.handle_modal_execution(
result = await self.handle_modal_execution(
interaction,
thread,
script_content,
Expand All @@ -76,7 +77,7 @@ async def run_modal(
reference_content,
status_msg,
)
return thread
return thread, result

except Exception as e:
logger.error(f"Error processing request: {str(e)}", exc_info=True)
Expand All @@ -94,7 +95,7 @@ async def handle_modal_execution(
gpu_type: str,
reference_content: Optional[str],
status_msg: discord.Message,
):
) -> FullResult:
try:
loop = asyncio.get_event_loop()
func_type = "pytorch" if filename.endswith(".py") else "cuda"
Expand All @@ -113,9 +114,11 @@ async def handle_modal_execution(
# Send results
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
await generate_report(thread, result)
return result

else:
result, score = await loop.run_in_executor(
# Currently broken?
result = await loop.run_in_executor(
None,
lambda: modal.Function.lookup("discord-bot-runner", func_name).remote(
script_content,
Expand All @@ -127,18 +130,10 @@ async def handle_modal_execution(

# Send results
await thread.send(f"\n**Script size:** {len(script_content)} bytes")
await thread.send(f"**Execution time:** {score:.3f} s\n")

if "check_implementation failed" in result or "Error" in result:
await thread.send("Modal run failed.\n")
await send_logs(thread, result)
await status_msg.edit(content="**Running on Modal...**\n> ❌ Job failed!")
return result, 0

if result is not None:
await thread.send(f"**score:{score:.9f}**\n```")
await thread.send(f"**Execution time:** {result.run.duration:.3f} s\n")

await status_msg.edit(content="**Running on Modal...**\n> ✅ Job completed!")
return result

except Exception as e:
logger.error(f"Error in handle_modal_execution: {str(e)}", exc_info=True)
Expand Down
3 changes: 2 additions & 1 deletion src/discord-cluster-manager/cogs/verify_run_cog.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,8 @@ async def verify_github_run(
]
await send_discord_message(
interaction,
f"❌ GitHub run ({choice.name}) for {lang} verification failed. Missing expected messages:\n"
f"❌ GitHub run ({choice.name}) for {lang} verification failed. "
+ "Missing expected messages:\n"
+ "\n".join(f"- {pattern}" for pattern in missing_patterns),
)
return False
Expand Down
4 changes: 2 additions & 2 deletions src/discord-cluster-manager/report.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ async def _send_split_log(thread: discord.Thread, partial_message: str, header:
return ""


async def generate_report(thread: discord.Thread, result: FullResult):
async def generate_report(thread: discord.Thread, result: FullResult): # noqa: C901
message = ""
if not result.success:
message += "# Failure\n"
Expand Down Expand Up @@ -106,7 +106,7 @@ async def generate_report(thread: discord.Thread, result: FullResult):
if len(message) != 0:
await thread.send(message)

# TODO dedicated "error" entry in our results dict that gets populated by check_implementation
# TODO dedicated "error" entry in our results that gets populated by check_implementation
return

# OK, we were successful
Expand Down

0 comments on commit 95ffba1

Please sign in to comment.