diff --git a/.github/workflows/nvidia_workflow.yml b/.github/workflows/nvidia_workflow.yml
index 1d6a191..88db57c 100644
--- a/.github/workflows/nvidia_workflow.yml
+++ b/.github/workflows/nvidia_workflow.yml
@@ -99,7 +99,7 @@ jobs:
         shell: bash
         run: |
           if [[ -n "${{ github.event.inputs.eval_content }}" ]]; then
-            if [[ "${{ github.event.inputs.eval_content }}" == *.cu ]]; then
+            if [[ "${{ github.event.inputs.eval_filename }}" == *.cu ]]; then
               echo "Compiling and running CUDA file..."
               nvcc "${{ github.event.inputs.eval_filename }}" -o cuda_program
               ./cuda_program > training.log 2>&1
diff --git a/src/discord-cluster-manager/cogs/leaderboard_cog.py b/src/discord-cluster-manager/cogs/leaderboard_cog.py
index f0aaeb5..ed03ef2 100644
--- a/src/discord-cluster-manager/cogs/leaderboard_cog.py
+++ b/src/discord-cluster-manager/cogs/leaderboard_cog.py
@@ -39,9 +39,7 @@ async def submit(
         gpu_type="Choose the GPU type for Modal",
     )
     @app_commands.choices(
-        gpu_type=[
-            app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in ModalGPU
-        ]
+        gpu_type=[app_commands.Choice(name=gpu.value, value=gpu.value) for gpu in ModalGPU]
     )
     async def submit_modal(
         self,
@@ -79,7 +77,7 @@ async def submit_modal(
                 f"Ran on Modal. Leaderboard '{leaderboard_name}'.\n"
                 + f"Submission title: {script.filename}.\n"
                 + f"Submission user: {interaction.user.id}.\n"
-                + f"Runtime: {score} ms",
+                + f"Runtime: {score:.9f} seconds.",
                 ephemeral=True,
             )
         except ValueError:
@@ -90,16 +88,12 @@ async def submit_modal(
             )

     ### GITHUB SUBCOMMAND
-    @app_commands.command(
-        name="github", description="Submit leaderboard data for GitHub"
-    )
+    @app_commands.command(name="github", description="Submit leaderboard data for GitHub")
     @app_commands.describe(
         gpu_type="Choose the GPU type for Github Runners",
     )
     @app_commands.choices(
-        gpu_type=[
-            app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in GitHubGPU
-        ]
+        gpu_type=[app_commands.Choice(name=gpu.name, value=gpu.value) for gpu in GitHubGPU]
     )
     async def submit_github(
         self,
@@ -157,9 +151,7 @@ async def submit_github(
             print(f"Webhook not found: {e}")
             await send_discord_message(interaction, "❌ The webhook was not found.")

-        message_contents = [
-            msg.content async for msg in github_thread.history(limit=None)
-        ]
+        message_contents = [msg.content async for msg in github_thread.history(limit=None)]

         # Compute eval or submission score, call runner here.
         # TODO: Make this more robust later
@@ -180,14 +172,17 @@ async def submit_github(
                 if interaction.user.nick is None
                 else interaction.user.nick
             )
+
             await send_discord_message(
                 interaction,
                 "Successfully ran on GitHub runners!\n"
                 + f"Leaderboard '{leaderboard_name}'.\n"
                 + f"Submission title: {script.filename}.\n"
-                + f"Submission user: {user_id}\n"
-                + f"Runtime: {score} ms\n",
+                + f"Submission user: {user_id}.\n"
+                + f"Runtime: {score:.9f} seconds.",
+                ephemeral=True,
             )
+
         except ValueError:
             await send_discord_message(
                 interaction,
@@ -225,9 +220,7 @@ async def select_callback(self, interaction: Interaction):
 class LeaderboardCog(commands.Cog):
     def __init__(self, bot):
         self.bot: commands.Bot = bot
-        self.get_leaderboards = bot.leaderboard_group.command(name="list")(
-            self.get_leaderboards
-        )
+        self.get_leaderboards = bot.leaderboard_group.command(name="list")(self.get_leaderboards)
         self.leaderboard_create = bot.leaderboard_group.command(
             name="create", description="Create a new leaderboard"
         )(self.leaderboard_create)
@@ -246,9 +239,7 @@ async def get_leaderboards(self, interaction: discord.Interaction):
         leaderboards = db.get_leaderboards()

         if not leaderboards:
-            await send_discord_message(
-                interaction, "No leaderboards found.", ephemeral=True
-            )
+            await send_discord_message(interaction, "No leaderboards found.", ephemeral=True)
             return

         # Create embed
@@ -257,9 +248,7 @@ async def get_leaderboards(self, interaction: discord.Interaction):

         # Add fields for each leaderboard
         for lb in leaderboards:
             deadline_str = lb["deadline"].strftime("%Y-%m-%d %H:%M")
-            embed.add_field(
-                name=lb["name"], value=f"Deadline: {deadline_str}", inline=False
-            )
+            embed.add_field(name=lb["name"], value=f"Deadline: {deadline_str}", inline=False)

         await interaction.followup.send(interaction, embed=embed)
@@ -318,7 +307,7 @@ async def leaderboard_create(
             if "duplicate key" in err:
                 await send_discord_message(
                     interaction,
-                    'Error: Tried to create a leaderboard '
+                    "Error: Tried to create a leaderboard "
                     f'"{leaderboard_name}" that already exists.',
                     ephemeral=True,
                 )
@@ -383,9 +372,7 @@ async def get_leaderboard_submissions(
         )

         for submission in submissions:
-            user_id = await get_user_from_id(
-                submission["user_id"], interaction, self.bot
-            )
+            user_id = await get_user_from_id(submission["user_id"], interaction, self.bot)

             embed.add_field(
                 name=f"{user_id}: {submission['submission_name']}",
diff --git a/src/discord-cluster-manager/leaderboard_eval.py b/src/discord-cluster-manager/leaderboard_eval.py
index 2d4992f..94e5dc3 100644
--- a/src/discord-cluster-manager/leaderboard_eval.py
+++ b/src/discord-cluster-manager/leaderboard_eval.py
@@ -3,10 +3,57 @@
 ########

 py_eval = """
-from reference import metric
+import torch
+import time
+from reference import ref_kernel, generate_input
+from train import custom_kernel
+
+
+def check_implementation() -> bool:
+    for _ in range(10):  # check multiple times
+        input_tensors = generate_input()
+        for input in input_tensors:
+            custom_output = custom_kernel(input)
+            ref_output = ref_kernel(input)
+
+            if not torch.allclose(custom_output, ref_output, atol=1e-5):
+                print("mismatch found! custom implementation doesn't match reference.")
+                return False
+
+    print('custom implementation matches the reference implementation.')
+    return True
+
+
+def metric():
+    warmup_runs = 10
+    timed_runs = 100
+
+    # warmup
+    print('warming up...')
+    for _ in range(warmup_runs):
+        input_tensors = generate_input()
+        for input in input_tensors:
+            _ = custom_kernel(input)
+            _ = ref_kernel(input)
+
+    # timing
+    print('timing custom implementation...')
+    input_tensors = generate_input()
+    start_time = time.time()
+    for _ in range(timed_runs):
+        for input in input_tensors:
+            _ = custom_kernel(input)
+
+    custom_duration = (time.time() - start_time) / timed_runs
+
+    print(f'submitted kernel runtime: {custom_duration:.4f} seconds')
+
+    return custom_duration


 def main():
+    assert check_implementation()
     s = metric()
+    print(f'score:{s}')

 if __name__ == '__main__':
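
For context, the new py_eval harness above assumes a specific interface: the problem's reference.py must expose generate_input() (returning an iterable of input tensors) and ref_kernel(input), and the submitted train.py must expose custom_kernel(input) with matching semantics. The sketch below only illustrates that assumed contract; the element-wise-square kernel and the tensor shapes are hypothetical and not part of this PR.

# reference.py -- illustrative sketch of the interface py_eval expects (hypothetical kernel)
import torch

def generate_input():
    # a small batch of random input tensors used for both checking and timing
    return [torch.randn(1024, 1024) for _ in range(4)]

def ref_kernel(input):
    # hypothetical reference operation: element-wise square
    return input * input

# train.py -- what a submission would provide
def custom_kernel(input):
    # must produce the same outputs as ref_kernel (compared with torch.allclose)
    return input ** 2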