generate_input receives a random seed argument

gpu-mode · Jan 14, 2025 · 35785a9 · 35785a9
1 parent d26faaf
commit 35785a9
Show file tree

Hide file tree

Showing 5 changed files with 50 additions and 25 deletions.
diff --git a/examples/identity_cuda/reference.cuh b/examples/identity_cuda/reference.cuh
@@ -6,6 +6,7 @@
 #include <cstdlib>
 #include <cmath>
 #include <array>
+#include <random>
 #include <iostream>
 
 #define N_SIZES 10
@@ -15,13 +16,16 @@ const int Ns[N_SIZES] = {128,  256,  512,   1024,  2048,
 using input_t = std::array<std::vector<float>, N_SIZES>;
 using output_t = input_t;
 
-input_t generate_input() {
+input_t generate_input(int seed) {
+  std::mt19937 rng(seed);
   input_t data;
 
+  std::uniform_real_distribution<float> dist(0, 1);
+
   for (int i = 0; i < N_SIZES; ++i) {
     data[i].resize(Ns[i]);
     for (int j = 0; j < Ns[i]; ++j) {
-      data[i][j] = static_cast<float>(rand()) / RAND_MAX;
+      data[i][j] = dist(rng);
     }
   }
 

diff --git a/examples/identity_py/reference.py b/examples/identity_py/reference.py
@@ -16,7 +16,7 @@ def ref_kernel(xs: List[torch.Tensor]) -> List[torch.Tensor]:
     return xs
 
 
-def generate_input() -> List[torch.Tensor]:
+def generate_input(seed: int) -> List[torch.Tensor]:
     """
     Generates random input tensor of the specified shape.
     Returns:
@@ -34,8 +34,10 @@ def generate_input() -> List[torch.Tensor]:
         device = torch.device("cpu")
 
     tensors = []
+    rng = torch.Generator(device=device)
+    rng.manual_seed(seed)
     for shape in shapes:
-        tensors.append(torch.randn(shape, device=device))
+        tensors.append(torch.randn(shape, device=device, generator=rng))
 
     return tensors
 

diff --git a/src/discord-cluster-manager/eval.cu b/src/discord-cluster-manager/eval.cu
@@ -52,21 +52,24 @@ static void cuda_check(cudaError_t status, const char* expr, const char* file, i
 
 #define cuda_check(expr) cuda_check(expr, #expr, __FILE__, __LINE__, __FUNCTION__)
 
-void measure_runtime(PopcornOutput& logger) {
+void measure_runtime(PopcornOutput& logger, std::mt19937& rng) {
     std::cout << "warming up..." << std::endl;
 
-    for (int i = 0; i < WARMUP_RUNS; i++) {
-        auto data = generate_input();
-        // discard result; this is just warmup, we don't care what it returns
-        (void)custom_kernel(data);
+    {
+        auto warmup_data = generate_input(rng());
+        for (int i = 0; i < WARMUP_RUNS; i++) {
+            // discard result; this is just warmup, we don't care what it returns
+            (void)custom_kernel(warmup_data);
+            cuda_check(cudaDeviceSynchronize());
+        }
     }
-    cuda_check(cudaDeviceSynchronize());
 
     std::vector<std::int64_t> durations;
     durations.reserve(TIMED_RUNS);
 
     for (int i = 0; i < TIMED_RUNS; i++) {
-        auto data = generate_input();
+        auto data = generate_input(rng());
+
         // make a copy of the input data to be used by the reference implementation
         auto copy = data;
 
@@ -124,7 +127,15 @@ int main() {
         return 111;
     }
 
-    auto data = generate_input();
+    // read the RNG seed from POPCORN_SEED (exported by run_eval.py); fall back to 42 if unset
+    const char *seed_str = std::getenv("POPCORN_SEED");
+    int seed = 42;
+    if (seed_str) {
+        seed = std::stoi(seed_str);
+    }
+
+    std::mt19937 rng(seed);
+    auto data = generate_input(rng());
     auto reference_output = ref_kernel(data);
     auto submission_output = custom_kernel(data);
 
@@ -133,6 +144,6 @@ int main() {
         return 112;
     }
 
-    measure_runtime(logger);
+    measure_runtime(logger, rng);
     return 0;
 }
diff --git a/src/discord-cluster-manager/eval.py b/src/discord-cluster-manager/eval.py
@@ -16,10 +16,9 @@ def log(self, key: str, value):
         print(f"{key}: {value}\n", file=self.channel)
 
 
-def correctness() -> bool:
+def correctness(rng: torch.Generator) -> bool:
     for _ in range(10):  # check multiple times
-        inputs = generate_input()
-
+        inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
         custom_output = custom_kernel(inputs)
         ref_output = ref_kernel(inputs)
 
@@ -30,22 +29,22 @@ def correctness() -> bool:
     return True
 
 
-def metric(logger: PopcornLogger):
+def metric(logger: PopcornLogger, rng: torch.Generator):
     warmup_runs = 10
     timed_runs = 100
 
     # Warmup Code
     print("warming up...")
     for _ in range(warmup_runs):
-        inputs = generate_input()
+        inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
         _ = custom_kernel(inputs)
     torch.cuda.synchronize()
 
     # Timing Code
     times = []
 
     for _ in range(timed_runs):
-        inputs = generate_input()
+        inputs = generate_input(torch.randint(0, int(2**31), (), generator=rng).item())
 
         start_time = time.time()
         custom_output = custom_kernel(inputs)
@@ -82,10 +81,14 @@ def main():
         print(e, file=sys.stderr)
         exit(111)
 
-    if not correctness():
+    seed = int(os.environ.get("POPCORN_SEED", 42))  # POPCORN_FD is the result pipe, not the seed
+    rng = torch.Generator()
+    rng.manual_seed(seed)
+
+    if not correctness(rng):
         logger.log("check", "fail")
         exit(112)
-    metric(logger)
+    metric(logger, rng)
 
 
 if __name__ == "__main__":

diff --git a/src/discord-cluster-manager/run_eval.py b/src/discord-cluster-manager/run_eval.py
@@ -52,7 +52,7 @@ def compile_cuda_script(  # # noqa: C901
         arch: Architecture to compile for. If None, uses `native`
         include_dirs: additional include directories to supply to nvcc
         verbose: whether to print progress or be silent
-
+        seed: Seed value to use for generating test cases
     Returns:
         A `CompileResult` that summarizes the compilation process.
 
@@ -114,11 +114,12 @@ def compile_cuda_script(  # # noqa: C901
     )
 
 
-def run_program(args: list[str]) -> RunResult:
+def run_program(args: list[str], seed: int) -> RunResult:
     # set up a pipe so the tester can communicate its verdict with us
     env = os.environ.copy()
     pipe_read, pipe_write = os.pipe()
     env["POPCORN_FD"] = str(pipe_write)
+    env["POPCORN_SEED"] = str(seed)
 
     execution_start_time = time.perf_counter()
     run_process = subprocess.run(
@@ -158,6 +159,7 @@ def run_cuda_script(  # # noqa: C901
     submission_content: str = None,
     arch: int = None,
     include_dirs: list[str] = None,
+    seed: int = 42,
 ) -> tuple[CompileResult, RunResult]:
     """
     Executes the provided CUDA kernel in an isolated environment with a timeout
@@ -168,6 +170,7 @@ def run_cuda_script(  # # noqa: C901
         submission_content: The (optional) submission code, used for leaderboards.
         arch: The arch code for the compute/sm versions. If None, native arch is used.
         include_dirs: Additional include directories, e.g., for thunderkittens/cutlass etc
+        seed: Random seed to initialize the RNG for testing
 
     Returns:
         tuple[CompileResult, RunResult]: CUDA compile/eval result information
@@ -206,7 +209,7 @@ def run_cuda_script(  # # noqa: C901
                 result={},
             )
 
-        run_result = run_program(["./eval.out"])
+        run_result = run_program(["./eval.out"], seed=seed)
         return compile_result, run_result
 
     finally:
@@ -221,6 +224,7 @@ def run_pytorch_script(  # noqa: C901
     reference_content: Optional[str] = None,
     submission_content: Optional[str] = None,
     arch: int = None,
+    seed: int = 42,
 ) -> RunResult:
     """
     Executes the provided PyTorch GPU kernel in an isolated environment
@@ -230,6 +234,7 @@ def run_pytorch_script(  # noqa: C901
         reference_content: The (optional) reference code, used for leaderboards.
         submission_content: The (optional) submission code, used for leaderboards.
         arch: The arch code for the compute/sm versions.
+        seed: Random seed to initialize the RNG for testing
 
     Returns:
         tuple[str, float]: (Kernel output, execution time in milliseconds)
@@ -247,7 +252,7 @@ def run_pytorch_script(  # noqa: C901
         with open("eval.py", "w") as f:
             f.write(script_content)
 
-        return run_program(["python", "eval.py"])
+        return run_program(["python", "eval.py"], seed=seed)
 
     finally:
         tmp_files = ["eval.py", "reference.py", "train.py"]