Fix bug in artifact creation of the LX6 instructions (#453)

There was a bug where we were appending LX6 instructions to an existing vector for subsequent entry points rather than making a new vector for each entry point. This change fixes that, and hence fixes #447, which was indeed a kernel timeout due to bad artifacts. It also adds a multi-dispatch e2e test to CI.
nod-ai · Jun 24, 2024 · 1ed8257 · 1ed8257
1 parent db0e636
commit 1ed8257
Show file tree

Hide file tree

Showing 3 changed files with 36 additions and 1 deletion.
diff --git a/build_tools/ci/cpu_comparison/run_test.sh b/build_tools/ci/cpu_comparison/run_test.sh
@@ -326,6 +326,9 @@ function run_test() {
 # Example of running a test directly from an .mlir file with a function.
 run_test --test_file ${THIS_DIR}/test_files/matmul_int32.mlir
 
+# An example of an arbitrary graph with three matmuls which form three dispatches.
+run_test --test_file ${THIS_DIR}/test_files/three_matmuls.mlir
+
 # Example of generating a matmul test from a template, and then running it.
 test_name=${OUTPUT_DIR}/test_from_template.mlir
 matmul_template_dir=${THIS_DIR}/matmul_template

diff --git a/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir
@@ -0,0 +1,31 @@
+// This test shows arbitrary matmuls that would have producer-consumer relationships
+// across different dispatches running on CI.
+
+// These 4 lines are required by the script which generates input data:
+//
+// input 32x32xf32
+// input 32x32xf32
+// input 32x4xf32
+// input 4x32xf32
+
+!A_TYPE = tensor<32x32xf32>
+!B_TYPE = tensor<32x4xf32>
+!C_TYPE = tensor<4x32xf32>
+!D_TYPE = tensor<4x4xf32>
+func.func @two_mm(%lhs : !A_TYPE,
+    %rhs : !A_TYPE, %rhs_2 : !B_TYPE, %lhs_2 : !C_TYPE) -> !D_TYPE {
+  %empty = tensor.empty() : !A_TYPE
+  %empty_2 = tensor.empty() : !B_TYPE
+  %empty_3 = tensor.empty() : !D_TYPE
+  %cst = arith.constant 0.0 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : !A_TYPE) -> !A_TYPE
+  %fill_2 = linalg.fill ins(%cst : f32) outs(%empty_2 : !B_TYPE) -> !B_TYPE
+  %fill_3 = linalg.fill ins(%cst : f32) outs(%empty_3 : !D_TYPE) -> !D_TYPE
+  %2 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !A_TYPE)
+      outs(%fill : !A_TYPE) -> !A_TYPE
+  %3 = linalg.matmul ins(%2, %rhs_2 : !A_TYPE, !B_TYPE)
+      outs(%fill_2 : !B_TYPE) -> !B_TYPE
+  %4 = linalg.matmul ins(%lhs_2, %3 : !C_TYPE, !B_TYPE)
+      outs(%fill_3 : !D_TYPE) -> !D_TYPE
+  return %4 : !D_TYPE
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -253,7 +253,6 @@ LogicalResult AIETargetBackend::serializeExecutable(
 
   SmallString<128> aie2xclbin(options.mlirAieInstallDir);
   llvm::sys::path::append(aie2xclbin, "bin", "aie2xclbin");
-  std::vector<uint32_t> npuInstrs;
   std::unique_ptr<llvm::MemoryBuffer> xclbinIn;
 
   FlatbufferBuilder builder;
@@ -359,6 +358,8 @@ LogicalResult AIETargetBackend::serializeExecutable(
 
     std::ifstream instrFile(static_cast<std::string>(npuInstPath));
     std::string line;
+    // Vector to store LX6 instructions.
+    std::vector<uint32_t> npuInstrs;
     while (std::getline(instrFile, line)) {
       std::istringstream iss(line);
       uint32_t a;