From 1ed825730c256b9e1c21a2280fa89631e1e2001f Mon Sep 17 00:00:00 2001
From: Nirvedh Meshram <96096277+nirvedhmeshram@users.noreply.github.com>
Date: Mon, 24 Jun 2024 12:02:08 -0500
Subject: [PATCH] Fix bug in artifact creation of the LX6 instructions (#453)

There was a bug where we were appending LX6 instructions to an existing
vector for subsequent entry points rather than making a new vector for
each entry point. This change fixes that and hence fixes
https://github.com/nod-ai/iree-amd-aie/issues/447, which was indeed a
kernel timeout due to bad artifacts.
Also adds a multi-dispatch e2e test to CI.
---
 build_tools/ci/cpu_comparison/run_test.sh     |  3 ++
 .../test_files/three_matmuls.mlir             | 31 +++++++++++++++++++
 .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp |  3 +-
 3 files changed, 36 insertions(+), 1 deletion(-)
 create mode 100644 build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir

diff --git a/build_tools/ci/cpu_comparison/run_test.sh b/build_tools/ci/cpu_comparison/run_test.sh
index 40fef78d9..93362daea 100755
--- a/build_tools/ci/cpu_comparison/run_test.sh
+++ b/build_tools/ci/cpu_comparison/run_test.sh
@@ -326,6 +326,9 @@ function run_test() {
 # Example of running a test directly from an .mlir file with a function.
 run_test --test_file ${THIS_DIR}/test_files/matmul_int32.mlir
 
+# An example of an arbitrary graph with three matmuls which form three dispatches.
+run_test --test_file ${THIS_DIR}/test_files/three_matmuls.mlir
+
 # Example of generating a matmul test from a template, and then running it.
 test_name=${OUTPUT_DIR}/test_from_template.mlir
 matmul_template_dir=${THIS_DIR}/matmul_template
diff --git a/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir
new file mode 100644
index 000000000..fe4fd3fcf
--- /dev/null
+++ b/build_tools/ci/cpu_comparison/test_files/three_matmuls.mlir
@@ -0,0 +1,31 @@
+// This test chains three matmuls that have producer-consumer relationships
+// across different dispatches; it is run on CI.
+
+// These 4 lines are required by the script which generates input data:
+//
+// input 32x32xf32
+// input 32x32xf32
+// input 32x4xf32
+// input 4x32xf32
+
+!A_TYPE = tensor<32x32xf32>
+!B_TYPE = tensor<32x4xf32>
+!C_TYPE = tensor<4x32xf32>
+!D_TYPE = tensor<4x4xf32>
+func.func @two_mm(%lhs : !A_TYPE,
+    %rhs : !A_TYPE, %rhs_2 : !B_TYPE, %lhs_2 : !C_TYPE) -> !D_TYPE {
+  %empty = tensor.empty() : !A_TYPE
+  %empty_2 = tensor.empty() : !B_TYPE
+  %empty_3 = tensor.empty() : !D_TYPE
+  %cst = arith.constant 0.0 : f32
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : !A_TYPE) -> !A_TYPE
+  %fill_2 = linalg.fill ins(%cst : f32) outs(%empty_2 : !B_TYPE) -> !B_TYPE
+  %fill_3 = linalg.fill ins(%cst : f32) outs(%empty_3 : !D_TYPE) -> !D_TYPE
+  %2 = linalg.matmul ins(%lhs, %rhs : !A_TYPE, !A_TYPE)
+      outs(%fill : !A_TYPE) -> !A_TYPE
+  %3 = linalg.matmul ins(%2, %rhs_2 : !A_TYPE, !B_TYPE)
+      outs(%fill_2 : !B_TYPE) -> !B_TYPE
+  %4 = linalg.matmul ins(%lhs_2, %3 : !C_TYPE, !B_TYPE)
+      outs(%fill_3 : !D_TYPE) -> !D_TYPE
+  return %4 : !D_TYPE
+}
diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
index 7eff26b59..5a71a2521 100644
--- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
+++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp
@@ -253,7 +253,6 @@ LogicalResult AIETargetBackend::serializeExecutable(
 
   SmallString<128> aie2xclbin(options.mlirAieInstallDir);
   llvm::sys::path::append(aie2xclbin, "bin", "aie2xclbin");
-  std::vector<uint32_t> npuInstrs;
   std::unique_ptr<llvm::MemoryBuffer> xclbinIn;
 
   FlatbufferBuilder builder;
@@ -359,6 +358,8 @@ LogicalResult AIETargetBackend::serializeExecutable(
 
     std::ifstream instrFile(static_cast<std::string>(npuInstPath));
     std::string line;
+    // LX6 instructions for this entry point; must be fresh per entry point.
+    std::vector<uint32_t> npuInstrs;
     while (std::getline(instrFile, line)) {
       std::istringstream iss(line);
       uint32_t a;