diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7f6b2cb2..4e9bbe923 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,7 @@ jobs: run: | python3 -m venv .venv source .venv/bin/activate - pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024060222+6f70bfe-py3-none-manylinux_2_35_x86_64.whl + pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061222+3ac9566-py3-none-manylinux_2_35_x86_64.whl pip install -r tests/matmul/requirements.txt diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 0765348de..98d451c4a 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -638,7 +638,7 @@ run_matmul_test \ --lhs_rhs_type "bf16" \ --acc_type "f32" \ --m "64" --n "64" --k "128" \ - --expect_compile_failure "1" + --num_repeat_runs "0" run_matmul_test \ --name_prefix "packPeelLarge" \ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td index 7d9b7e70f..1ad7d9506 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td @@ -41,6 +41,18 @@ def LogicalObjectFifoPort: I32EnumAttr<"LogicalObjectFifoPort", let cppNamespace = "mlir::iree_compiler::AMDAIE"; } +def MemoryAccess: I32EnumAttr<"MemoryAccess", + "The memory access type", + [ + I32EnumAttrCase<"None", 0>, + I32EnumAttrCase<"Read", 1>, + I32EnumAttrCase<"Write", 2>, + I32EnumAttrCase<"Any", 3>, + ] + > { + let cppNamespace = "mlir::iree_compiler::AMDAIE"; +} + def AMDAIE_MemSpace_Global : I32EnumAttrCase<"Global", 0>; def AMDAIE_MemSpace_Shared : I32EnumAttrCase<"Shared", 1>; def AMDAIE_MemSpace_Local : I32EnumAttrCase<"Local", 2>; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index e4c975e68..b56bbc31d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -328,6 +328,22 @@ LogicalObjectFifoFromMemrefOp CircularDmaCpyNdOp::getTargetObjectFifo() { return dyn_cast(getTarget().getDefiningOp()); }; +//===----------------------------------------------------------------------===// +// AMDAIE_LogicalObjectFifoAccessOp +//===----------------------------------------------------------------------===// + +void LogicalObjectFifoAccessOp::build(OpBuilder &b, + mlir::OperationState &result, Value input, + MemoryAccess accessType) { + auto type = llvm::cast(input.getType()); + build(b, result, type.getElementType(), input, accessType); +} + +LogicalObjectFifoFromMemrefOp +LogicalObjectFifoAccessOp::getLogicalObjectFifo() { + return dyn_cast(getInput().getDefiningOp()); +}; + //===----------------------------------------------------------------------===// // AMDAIE_LogicalObjectFifoAcquire //===----------------------------------------------------------------------===// @@ -341,6 +357,26 @@ void LogicalObjectFifoAcquire::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_LogicalObjectFifoFromMemrefOp //===----------------------------------------------------------------------===// +/// Build with an array of static tile locations. 
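The builder defined next takes static `(column, row)` tile locations, materializes `amdaie.tile` ops for them and sorts them column-first, then row, for deterministic IR. A minimal standalone sketch of that ordering and de-duplication, using plain `std::pair` coordinates rather than the actual `TileOp` values (illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Column-major-then-row ordering, mirroring the intent of
// TileOp::tileValueColumnAndRowComparator added later in this patch.
static bool colRowLess(const std::pair<int64_t, int64_t> &a,
                       const std::pair<int64_t, int64_t> &b) {
  if (a.first == b.first) return a.second < b.second;
  return a.first < b.first;
}

int main() {
  // (col, row) tile coordinates in arbitrary order, with a duplicate.
  std::vector<std::pair<int64_t, int64_t>> tiles = {
      {1, 3}, {0, 2}, {1, 3}, {0, 1}};
  std::sort(tiles.begin(), tiles.end(), colRowLess);
  tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end());
  for (auto [col, row] : tiles)
    std::cout << "(" << col << ", " << row << ")\n";  // (0,1) (0,2) (1,3)
  return 0;
}
```

Sorting followed by `std::unique` is also what the updated `canonicalize` of `LogicalObjectFifoFromMemrefOp` below does on the tile operand list.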
+void LogicalObjectFifoFromMemrefOp::build( + OpBuilder &b, mlir::OperationState &result, Value memref, + ArrayRef> tileLocations) { + SmallVector tiles; + tiles.reserve(tileLocations.size()); + for (auto [column, row] : tileLocations) { + auto colIndex = b.create(b.getUnknownLoc(), column); + auto rowIndex = b.create(b.getUnknownLoc(), row); + auto tileOp = + b.create(b.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // For deterministic order. + llvm::sort(tiles.begin(), tiles.end(), + TileOp::tileValueColumnAndRowComparator); + auto type = LogicalObjectFifoType::get(cast(memref.getType())); + build(b, result, type, memref, tiles); +} + LogicalResult LogicalObjectFifoFromMemrefOp::canonicalize( LogicalObjectFifoFromMemrefOp logicalObjectFifo, PatternRewriter &rewriter) { @@ -349,23 +385,19 @@ LogicalResult LogicalObjectFifoFromMemrefOp::canonicalize( return success(); } - auto comparator = [](Value a, Value b) -> bool { - TileOp tileA = dyn_cast(a.getDefiningOp()); - TileOp tileB = dyn_cast(b.getDefiningOp()); - int64_t colA = getConstantIntValue(tileA.getCol()).value(); - int64_t rowA = getConstantIntValue(tileA.getRow()).value(); - int64_t colB = getConstantIntValue(tileB.getCol()).value(); - int64_t rowB = getConstantIntValue(tileB.getRow()).value(); - if (colA == colB) return rowA < rowB; - return colA < colB; - }; SmallVector tiles = logicalObjectFifo.getTiles(); - if (llvm::is_sorted(tiles, comparator)) { + if (llvm::is_sorted(tiles, TileOp::tileValueColumnAndRowComparator)) { + // Still erase duplicates. + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); return success(); } - // If tiles are not sorted, sort them and replace the logical objectfifo - llvm::sort(tiles.begin(), tiles.end(), comparator); + // If tiles are not sorted, sort them, erase duplicates and replace the + // logical objectfifo. 
+ llvm::sort(tiles.begin(), tiles.end(), + TileOp::tileValueColumnAndRowComparator); + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + rewriter.replaceOpWithNewOp( logicalObjectFifo, llvm::cast( @@ -532,6 +564,23 @@ bool TileOp::hasStaticLocation() { return getConstantIntValue(getCol()) && getConstantIntValue(getRow()); } +bool TileOp::tileColumnComparator(AMDAIE::TileOp &a, AMDAIE::TileOp &b) { + int64_t colA = getConstantIntValue(a.getCol()).value(); + int64_t colB = getConstantIntValue(b.getCol()).value(); + return colA < colB; +} + +bool TileOp::tileValueColumnAndRowComparator(Value a, Value b) { + TileOp tileA = dyn_cast(a.getDefiningOp()); + TileOp tileB = dyn_cast(b.getDefiningOp()); + int64_t colA = getConstantIntValue(tileA.getCol()).value(); + int64_t rowA = getConstantIntValue(tileA.getRow()).value(); + int64_t colB = getConstantIntValue(tileB.getCol()).value(); + int64_t rowB = getConstantIntValue(tileB.getRow()).value(); + if (colA == colB) return rowA < rowB; + return colA < colB; +}; + //===----------------------------------------------------------------------===// // AMDAIE_WorkgroupOp //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index b565212a0..083b94db6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -110,6 +110,11 @@ def AMDAIE_TileOp: AMDAIE_Op<"tile", [ let extraClassDeclaration = [{ bool hasStaticLocation(); + // Comparator for `amdaie.tile` based on column index. + static bool tileColumnComparator(AMDAIE::TileOp &a, AMDAIE::TileOp &b); + // Comparator for `amdaie.tile` values based on column index first and then + // row index. + static bool tileValueColumnAndRowComparator(Value a, Value b); }]; } @@ -319,6 +324,53 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> { // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// +def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> { + let summary = "Operation to access the encapsulated memref from a logical" + "objectFifo."; + let description = [{ + Returns the encapsulated memref from a logical objectFifo. This is meant to + be used within `amdaie.core` operations to access and operate on the memref. + Has a memory `access_type` argument that indicates the type of access being + done. This can be used to generate a correct (semaphore) synchronization + scheme to access the logical objectFifo's content. + + Example: + ```mlir + %tile = amdaie.tile(%c1, %c3) + %alloc = memref.alloc() : memref<8x16xi32, 2> + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<8x16xi32, 2> + -> !amdaie.logicalobjectfifo> + %core = amdaie.core(%tile) { + %1 = amdaie.logicalobjectfifo.access(%0, Read) : + !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> + ``` + }]; + + let arguments = ( + ins AnyAMDAIELogicalObjectFifoType:$input, + MemoryAccess:$access_type + ); + + let results = (outs AnyMemRef:$output); + + let assemblyFormat = [{ + `(` $input `,` $access_type `)` attr-dict `:` type($input) `->` type($output) + }]; + + let builders = [ + // Build a LogicalObjectFifoAccessOp with a logicalObjectFifo value and access + // type. 
+ OpBuilder<(ins "mlir::Value":$input, "MemoryAccess":$access_type)> + ]; + + let extraClassDeclaration = [{ + LogicalObjectFifoFromMemrefOp getLogicalObjectFifo(); + }]; + + // let hasVerifier = 1; + let cppNamespace = "mlir::iree_compiler::AMDAIE"; +} + def AMDAIE_LogicalObjectFifoAcquire: AMDAIE_Op<"logicalobjectfifo.acquire", []> { let summary = "Semaphore operation to acquire objects from a logical" @@ -430,7 +482,12 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp // Build a LogicalObjectFifoFromMemrefOp with just a memref value. let builders = [ - OpBuilder<(ins "mlir::Value":$memref)> + OpBuilder<(ins "mlir::Value":$memref)>, + // Build `LogicalObjectFifoFromMemrefOp` with an array of static tile + // locations. + OpBuilder< + (ins "mlir::Value":$memref, + "::llvm::ArrayRef>":$tileLocations)> ]; let extraClassDeclaration = [{ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index bedaf8c08..670e303a2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -116,6 +116,15 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo>) { + %0 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2 : i32> + return +} + +// ----- + // CHECK-LABEL: func.func @logicalobjectfifo_acquire // CHECK: %[[DMA:.+]] = amdaie.dma_cpy_nd // CHECK: amdaie.logicalobjectfifo.acquire diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 170e9ea30..84c0da3e8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -9,6 +9,7 @@ #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/IR/Iterators.h" #include "mlir/Pass/Pass.h" @@ -24,6 +25,144 @@ static const llvm::StringLiteral kAMDAIELoopUnroll = "amdaie.unroll"; namespace { +//===----------------------------------------------------------------------===// +// Utilities +//===----------------------------------------------------------------------===// + +/// Comparator for a pair of `amdaie.dma_cpy_nd` on the first tile operation's +/// column index. +bool dmaColComparator( + std::pair> &a, + std::pair> &b) { + return TileOp::tileColumnComparator(a.second[0], b.second[0]); +}; + +/// Utility to use tuple coordinates as key of a `DenseMap`. 
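The `LocationMapInfo` struct that follows supplies the empty key, tombstone key, hash and equality that `DenseMap` needs in order to key on a list of `(column, row)` tuples. For intuition, here is a roughly equivalent standalone sketch using `std::unordered_map` with a custom hash; the buffer names are made up for illustration and this is not the patch's API:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using Locations = std::vector<std::pair<int64_t, int64_t>>;

// Combine the hashes of all (col, row) pairs, similar in spirit to the
// llvm::hash_combine_range call in LocationMapInfo::getHashValue.
struct LocationsHash {
  size_t operator()(const Locations &locs) const {
    size_t seed = 0;
    for (auto [col, row] : locs) {
      seed ^= std::hash<int64_t>()(col) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      seed ^= std::hash<int64_t>()(row) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    return seed;
  }
};

int main() {
  // Map a set of tile locations to a (hypothetical) buffer name, the same
  // role locationsToMemref plays in distributeSharedMemory further down.
  std::unordered_map<Locations, std::string, LocationsHash> locationsToBuffer;
  locationsToBuffer[{{0, 1}}] = "buffer_a";
  locationsToBuffer[{{0, 1}, {1, 1}}] = "buffer_b";
  std::cout << locationsToBuffer[{{0, 1}}] << "\n";  // buffer_a
  return 0;
}
```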
+struct LocationMapInfo { + static SmallVector> getEmptyKey() { + return {std::make_pair(int64_t(-1), int64_t(-1))}; + } + + static SmallVector> getTombstoneKey() { + return {std::make_pair(int64_t(-2), int64_t(-2))}; + } + + static unsigned getHashValue( + const SmallVector> &v) { + return static_cast(llvm::hash_combine_range(v.begin(), v.end())); + } + + static bool isEqual(const SmallVector> &lhs, + const SmallVector> &rhs) { + return lhs == rhs; + } +}; + +//===----------------------------------------------------------------------===// +// AMDAIEDistributeCoresAndObjectFifosPass +//===----------------------------------------------------------------------===// + +/// Distribute local memory accesses through subviews by allocating a single +/// smaller memory. This is needed because cores can't operate on one larger L1 +/// memory. +LogicalResult distributeLocalMemory(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + SmallVector toBeErased; + // Map from alloc operations to a new alloc operations to be used. + DenseMap memrefToNew; + + moduleOp->walk([&](memref::AllocOp allocOp) { + // Only consider local memory (L1). + Attribute memSpace = + cast(allocOp.getResult().getType()).getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + LLVM_DEBUG(llvm::dbgs() + << "DistributeLocalMemory for: " << allocOp << "\n"); + + SmallVector dmaUsers; + for (Operation *userOp : allocOp->getUsers()) { + if (auto logicalObjectFifo = + dyn_cast(userOp)) { + for (Operation *objFifoUserOp : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(objFifoUserOp); + dmaOp.getSourceObjectFifo() == logicalObjectFifo) { + dmaUsers.push_back(dmaOp); + } + } + } + } + if (dmaUsers.empty()) return WalkResult::advance(); + LLVM_DEBUG(llvm::dbgs() << "DMA users: " << dmaUsers.size() << "\n"); + + for (Operation *userOp : allocOp->getUsers()) { + auto subviewOp = dyn_cast(userOp); + if (!subviewOp) continue; + + if (!memrefToNew.contains(allocOp)) { + LLVM_DEBUG(llvm::dbgs() << "Create new allocate\n"); + rewriter.setInsertionPoint(allocOp); + auto memRefType = cast(subviewOp.getResult().getType()); + MemRefType allocType = MemRefType::get( + memRefType.getShape(), memRefType.getElementType(), + MemRefLayoutAttrInterface{}, memRefType.getMemorySpace()); + auto newAllocOp = rewriter.create( + rewriter.getUnknownLoc(), allocType); + auto newDeallocOp = rewriter.create( + rewriter.getUnknownLoc(), newAllocOp); + newDeallocOp->moveBefore(&newAllocOp->getBlock()->back()); + memrefToNew[allocOp] = newAllocOp; + } + auto newAlloc = memrefToNew[allocOp]; + rewriter.replaceAllUsesWith(subviewOp, newAlloc); + toBeErased.push_back(subviewOp); + } + + // Update the alloc's DMA users. 
+ if (memrefToNew.contains(allocOp)) { + LLVM_DEBUG(llvm::dbgs() + << "Update allocate DMA users: " << dmaUsers.size() << "\n"); + auto newAlloc = memrefToNew[allocOp]; + auto type = cast(newAlloc.getType()); + for (AMDAIE::DmaCpyNdOp dmaOp : dmaUsers) { + SmallVector empty; + rewriter.setInsertionPoint(dmaOp.getSourceObjectFifo()); + auto source = rewriter.create( + rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), + newAlloc.getResult()); + rewriter.replaceOp(dmaOp.getSourceObjectFifo(), source); + rewriter.setInsertionPoint(dmaOp); + auto newDmaOp = rewriter.create( + dmaOp.getLoc(), dmaOp.getTarget(), dmaOp.getTargetOffsets(), + dmaOp.getTargetSizes(), dmaOp.getTargetStrides(), source, + dmaOp.getSourceOffsets(), dmaOp.getSourceSizes(), + dmaOp.getSourceStrides()); + rewriter.replaceOp(dmaOp, newDmaOp); + } + + // Insert dealloc + memref::DeallocOp deallocOp; + for (Operation *userOp : allocOp->getUsers()) { + if (auto deallocUser = dyn_cast(userOp)) { + deallocOp = deallocUser; + } + } + if (deallocOp) { + toBeErased.push_back(deallocOp); + } + toBeErased.push_back(allocOp); + } + return WalkResult::advance(); + }); + + for (auto *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + /// Convert inner scf.forall ops chosen for parallel distribution to scf.for /// ops. LogicalResult localForallToFor(ModuleOp moduleOp) { @@ -192,39 +331,44 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { rewriter.create(rewriter.getUnknownLoc(), 1)); // Iterate through the loop and create body - IRMapping operandMap; for (auto i = lbInt + stepInt; i < ubInt; i += stepInt) { + IRMapping operandMap; Value ivUnroll = builder.create(builder.getUnknownLoc(), i); if (!forOpIV.use_empty()) { operandMap.map(forOpIV, ivUnroll); } - // Iterate through body and clone ops + // Iterate through body and map internal logical objectfifos to new ones + // and fill operand map. for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) { if (auto dmaOp = dyn_cast(*it)) { AMDAIE::LogicalObjectFifoFromMemrefOp source = dmaOp.getSourceObjectFifo(); + uint64_t sourceMemSpaceInt = source.getMemorySpaceAsUInt(); AMDAIE::LogicalObjectFifoFromMemrefOp target = dmaOp.getTargetObjectFifo(); - if (!operandMap.contains(source.getOutput())) { - rewriter.setInsertionPoint(source); - auto cloneOp = dyn_cast( - rewriter.clone(*dmaOp.getSource().getDefiningOp())); - operandMap.map(source.getOutput(), cloneOp.getOutput()); - } - if (!operandMap.contains(target.getOutput())) { + uint64_t targetMemSpaceInt = target.getMemorySpaceAsUInt(); + if (targetMemSpaceInt > sourceMemSpaceInt) { rewriter.setInsertionPoint(target); auto cloneOp = dyn_cast( rewriter.clone(*dmaOp.getTarget().getDefiningOp())); operandMap.map(target.getOutput(), cloneOp.getOutput()); + } else if (sourceMemSpaceInt > targetMemSpaceInt) { + rewriter.setInsertionPoint(source); + auto cloneOp = dyn_cast( + rewriter.clone(*dmaOp.getSource().getDefiningOp())); + operandMap.map(source.getOutput(), cloneOp.getOutput()); } - builder.clone(*it, operandMap); - } else { - builder.clone(*it, operandMap); } } + + // Iterate through body and clone ops + for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); + it++) { + builder.clone(*it, operandMap); + } } return success(); } @@ -265,65 +409,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Assign tiles to the logical objectfifos with local memory space (L1). 
-/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - // Map from local objectfifos found to the tiles where they are used - DenseMap> - logicalObjectFifosToTiles; - - // Utility function insert a local objectfifo - tile pair into the local - // objectfifo to tile map - auto insertTile = [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - Value tileResult) -> void { - if (!logicalObjectFifosToTiles.contains(logicalObjectFifo)) { - logicalObjectFifosToTiles[logicalObjectFifo] = {}; - } - logicalObjectFifosToTiles[logicalObjectFifo].insert(tileResult); - }; - - // Walk DMA ops and find the ones which are used in cores to update - // source/target logical objectfifos - moduleOp->walk([&](AMDAIE::DmaCpyNdOp dmaOp) { - for (Operation *userOp : dmaOp->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - Attribute sourceMemspace = dmaOp.getSourceObjectFifo().getMemorySpace(); - Attribute targetMemspace = dmaOp.getTargetObjectFifo().getMemorySpace(); - if (sourceMemspace && - dyn_cast(sourceMemspace).getInt() == 2) { - // Source on L1 - insertTile(dmaOp.getSourceObjectFifo(), - coreOp.getTileOp().getResult()); - } else if (targetMemspace && - dyn_cast(targetMemspace).getInt() == 2) { - // Target on L1 - insertTile(dmaOp.getTargetObjectFifo(), - coreOp.getTileOp().getResult()); - } - - // Move tile to beginning of parent block. - rewriter.moveOpBefore(coreOp.getTileOp(), coreOp->getBlock(), - coreOp->getBlock()->begin()); - } - } - return WalkResult::advance(); - }); - - // Update the logical objectfifos with assigned tiles - for (auto &&[logicalObjectFifo, tiles] : logicalObjectFifosToTiles) { - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles.takeVector()); - } - return success(); -} - /// Return the tiles of the sources respectively targets of the users of this /// logical objectfifo, depending on whether the OperateOn template parameter is /// set to `OperateOn::Source` respectively `OperateOn::Target`. @@ -353,19 +438,172 @@ LogicalResult getUserTiles( return success(); } -/// Assign logical objectfifos to physical AIE tiles. This rewrite takes an -/// iterative approach by matching logical objectfifos and only assigning tiles -/// when linked through dma ops with other logical objectfifos which already -/// have tiles assigned. If the linked logical objectfifos don't have tiles -/// assigned yet, we will return a failure and give the linked logical -/// objectfifos a chance to assign tiles before returning to this one. +/// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the +/// memrefs from logical objectfifos and update the computational operations to +/// operate on these local memrefs. +LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + SmallVector coreOps; + moduleOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); + + for (AMDAIE::CoreOp coreOp : coreOps) { + DenseMap> + memrefToLogicalObjectFifo; + // First walk to collect consume/produce DMA accesses and map respective + // memrefs to logical objectifos. + coreOp->walk([&](Operation *op) { + // TODO(jornt): can we avoid produce/consume? 
+ if (auto consumeOp = dyn_cast(op)) { + Value targetMemref = + consumeOp.getDmaCpyNdOp().getTargetObjectFifo().getMemref(); + memrefToLogicalObjectFifo[targetMemref] = + std::make_pair(consumeOp.getDmaCpyNdOp().getTargetObjectFifo(), + AMDAIE::MemoryAccess::Read); + } else if (auto produceOp = + dyn_cast(op)) { + Value sourceMemref = + produceOp.getDmaCpyNdOp().getSourceObjectFifo().getMemref(); + memrefToLogicalObjectFifo[sourceMemref] = + std::make_pair(produceOp.getDmaCpyNdOp().getSourceObjectFifo(), + AMDAIE::MemoryAccess::Write); + } + }); + + WalkResult res = coreOp->walk([&](Operation *op) { + if (auto linalgOp = dyn_cast(op)) { + for (auto &&[idx, operand] : + llvm::enumerate(linalgOp->getOpOperands())) { + if (memrefToLogicalObjectFifo.contains(operand.get())) { + rewriter.setInsertionPointToStart(coreOp.getBody()); + std::tuple + value = memrefToLogicalObjectFifo[operand.get()]; + rewriter.create( + rewriter.getUnknownLoc(), std::get<0>(value), + std::get<1>(value)); + // TODO(jornt): Temporary, enable after access operations are used + // for inserting synchronization stubs instead of consume/produce. + // linalgOp->setOperand(idx, accessOp); + } else if (auto type = + llvm::dyn_cast(operand.get().getType())) { + Value memref = operand.get(); + rewriter.setInsertionPoint(coreOp); + auto logicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), + memref); + rewriter.setInsertionPointToStart(coreOp.getBody()); + rewriter.create( + rewriter.getUnknownLoc(), logicalObjectFifo, + AMDAIE::MemoryAccess::None); + // TODO(jornt): Temporary, enable after access operations are used + // for inserting synchronization stubs instead of consume/produce. + // linalgOp->setOperand(idx, accessOp); + } + } + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + } + return success(); +} + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. +LogicalResult findUsersInCoreAndAddTiles( + Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + llvm::SmallSetVector, 16> &tiles) { + for (Operation *userOp : op->getUsers()) { + if (auto coreOp = userOp->getParentOfType()) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) { + return coreOp.emitOpError() << "has non-constant tile location"; + } + tiles.insert(std::make_pair(column.value(), row.value())); + } + if (auto subviewOp = dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); + } else if (auto userLogicalObjectFifo = + dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(userLogicalObjectFifo, + logicalObjectFifo, tiles); + } + } + return success(); +} + +/// Assign tiles to the logical objectfifos with local memory space (L1). +/// The tiles are derived from the usage of the logical objectfifos within +/// core operations, which are already assigned a tile location. 
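The `insertLogicalObjectFifoAccess` walk above records, per memref used inside a core, which logical objectfifo backs it and whether the core consumes (reads) or produces (writes) it, and then materializes `amdaie.logicalobjectfifo.access` ops with that access type. A standalone sketch of just that classification bookkeeping, with hypothetical SSA names and a plain `std::map` standing in for the IR types (illustrative only):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Mirrors the MemoryAccess enum added in AMDAIEAttrs.td.
enum class MemoryAccess { None = 0, Read = 1, Write = 2, Any = 3 };

int main() {
  // memref name -> (backing objectfifo name, access type seen in the core).
  std::map<std::string, std::pair<std::string, MemoryAccess>> memrefToFifo;

  // A "consume" means the core reads data a DMA copied into the target fifo.
  memrefToFifo["%alloc_in"] = {"%fifo_in", MemoryAccess::Read};
  // A "produce" means the core writes data a DMA will copy out of the source.
  memrefToFifo["%alloc_out"] = {"%fifo_out", MemoryAccess::Write};

  for (const auto &[memref, info] : memrefToFifo)
    std::cout << memref << " -> access(" << info.first << ", "
              << (info.second == MemoryAccess::Read ? "Read" : "Write")
              << ")\n";
  return 0;
}
```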
+LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + WalkResult res = moduleOp->walk( + [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles( + logicalObjectFifo, logicalObjectFifo, tileLocations))) { + return WalkResult::interrupt(); + } + // Handle subviews. + for (Operation *userOp : + logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + if (auto subviewOp = dyn_cast(userOp)) { + if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, + tileLocations))) { + return WalkResult::interrupt(); + } + } + } + + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(logicalObjectFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tiles); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +/// Assign a set of potential physical AIE tiles to logical objectFifos. This +/// rewrite takes an iterative approach by matching logical objectfifos and only +/// assigning tiles when linked through dma ops with other logical objectfifos +/// which already have tiles assigned. If the linked logical objectfifos don't +/// have tiles assigned yet, we will return a failure and give the linked +/// logical objectfifos a chance to assign tiles before returning to this one. /// -/// TODO(jornt): There are decisions being made in this pass on which tile to +/// TODO(jornt): There are decisions being made in this pass on which tiles to /// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the leftmost available column. At some point, we probably need some -/// AIE device model to guide the assignement here for performance and to avoid -/// resource issues down below. -class AssignAieTiles +/// to use the tiles in the same columns as targets and sources. At some point, +/// we probably need some AIE device model to guide the assignement here for +/// performance and to avoid hardware resource issues later on. +class FillAieTiles : public OpRewritePattern { using OpRewritePattern< AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; @@ -373,6 +611,7 @@ class AssignAieTiles LogicalResult matchAndRewrite( AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); if (!logicalObjectFifo.getTiles().empty()) { return failure(); } @@ -388,88 +627,246 @@ class AssignAieTiles } return failure(); } + // HandLe both L3/shim and L2/Memtiles. + // Skip logical objectfifos within non-global and non-shared memory. 
+ if (memSpace && dyn_cast(memSpace).getInt() != 1) { + return logicalObjectFifo.emitOpError() + << "found logical objectfifo with unknown memory space"; + } + + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(logicalObjectFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(logicalObjectFifo, sourceTiles); + + // If no source and target tiles found, skip. + if (failed(dstRes) && failed(srcRes)) { + return failure(); + } - SmallVector tileResults; - if (!memSpace || dyn_cast(memSpace).getInt() == 1) { - // HandLe both L3/shim and L2/Memtiles. Try to use memtiles in the same - // column as the AIE tiles where the data needs to go to. - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int rowInt = memSpace ? 1 : 0; - Value row = rewriter.create( - rewriter.getUnknownLoc(), rowInt); - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); + // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie + // target model for this. + int64_t rowInt = memSpace ? 1 : 0; + llvm::SmallSetVector, 16> tileLocations; + auto createTileLocations = + [&](SmallVector &tiles) -> LogicalResult { + // TODO(jornt): For now, for deterministic behaviour, sort on column + // index and use first one. This needs to be generalized to assign + // tiles based on a resource model. + std::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + // Erase duplicates. + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + for (AMDAIE::TileOp tile : tiles) { + std::optional column = getConstantIntValue(tile.getCol()); + if (!column) return tile.emitOpError() << "found non-constant column"; + tileLocations.insert(std::make_pair(column.value(), rowInt)); } + return success(); + }; - auto colComparator = [](AMDAIE::TileOp &a, AMDAIE::TileOp &b) -> bool { - int64_t colA = getConstantIntValue(a.getCol()).value(); - int64_t colB = getConstantIntValue(b.getCol()).value(); - return colA < colB; - }; - if (!targetTiles.empty()) { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign tiles - // based on a resource model. - std::sort(targetTiles.begin(), targetTiles.end(), colComparator); - Value col = targetTiles[0].getCol(); - tileResults.push_back( - rewriter.create(rewriter.getUnknownLoc(), col, row) - .getResult()); - } else if (!sourceTiles.empty()) { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign tiles - // based on a resource model. - std::sort(sourceTiles.begin(), sourceTiles.end(), colComparator); - Value col = sourceTiles[0].getCol(); - tileResults.push_back( - rewriter.create(rewriter.getUnknownLoc(), col, row) - .getResult()); - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. 
+ if (!targetTiles.empty() && !sourceTiles.empty()) { + return logicalObjectFifo.emitOpError() + << "found logical objectfifo with both source and target tiles, " + "which is not supported yet"; + } else if (!targetTiles.empty()) { + // Create tile locations for this logical objectfifo based on target + // tiles. + if (failed(createTileLocations(targetTiles))) { + return failure(); + } + } else if (!sourceTiles.empty()) { + // Create tile locations for this logical objectfifo based on source + // tiles. + if (failed(createTileLocations(sourceTiles))) { return failure(); } } else { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; + // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait + // for other logical objectfifos to be assigned first. + return failure(); } + // If no tile results, skip, and maybe in a next iteration another tile will // be found. - if (tileResults.empty()) { + if (tileLocations.empty()) { return failure(); } - // Extend this logical objectfifo's tile set. - SmallVector objFifoTiles = logicalObjectFifo.getTiles(); - DenseSet tileSet(objFifoTiles.begin(), objFifoTiles.end()); + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, logicalObjectFifo.getMemref(), + tileLocations.takeVector()); + return success(); + } +}; - // If the logical objectfifo already contains all the new tiles, skip. - if (llvm::all_of(tileResults, - [&](Value val) { return tileSet.contains(val); })) { - return failure(); +/// Return the user DMA operations and corresponding assigned tiles in the +/// specified direction (source or target). +template +SmallVector>> +getUserDmasAndTiles(AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + SmallVector>> + dmaOps; + for (Operation *user : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(user)) { + ValueRange tileIndices; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getSourceObjectFifo().getTiles(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getTargetObjectFifo().getTiles(); + } + SmallVector tiles; + for (Value index : tileIndices) + tiles.push_back(dyn_cast(index.getDefiningOp())); + dmaOps.push_back(std::make_pair(dmaOp, tiles)); + } + } + return dmaOps; +} + +/// Assign specific tile locations to objectFifos, starting from the set of +/// potential tile locations filled in earlier. +LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (memSpace && dyn_cast(memSpace).getInt() != 1) + return WalkResult::advance(); + + SmallVector tiles = llvm::map_to_vector( + logicalObjectFifo.getTiles(), + [](Value tile) { return dyn_cast(tile.getDefiningOp()); }); + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + SmallVector>> + sourceDmaOps = + getUserDmasAndTiles(logicalObjectFifo); + SmallVector>> + targetDmaOps = + getUserDmasAndTiles(logicalObjectFifo); + + // Assign tiles for following cases: + // 1) No source DMA operations (e.g. L3 -> L2): distribute onto multiple + // tiles to potentially use multiple shim DMAs for reading from global + // memory in different columns. 
+ // 2) No target DMA operations (e.g. L2 -> L3): + // distribute onto multiple tiles to potentially use multiple shim DMAs for + // writing to global memory in different columns. + // 3) Default: assign first tile from the sorted sequence of potential + // tiles. + if (sourceDmaOps.empty() && targetDmaOps.size() == tiles.size()) { + llvm::sort(targetDmaOps.begin(), targetDmaOps.end(), dmaColComparator); + for (auto &&[tile, dmaOpElem] : llvm::zip(tiles, targetDmaOps)) { + rewriter.setInsertionPoint(logicalObjectFifo); + SmallVector tileResults = {cast(tile.getResult())}; + auto newLogicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + dmaOpElem.first->replaceUsesOfWith(logicalObjectFifo.getResult(), + newLogicalObjectFifo.getResult()); + } + } else if (targetDmaOps.empty() && sourceDmaOps.size() == tiles.size()) { + llvm::sort(sourceDmaOps.begin(), sourceDmaOps.end(), dmaColComparator); + for (auto &&[tile, dmaOpElem] : llvm::zip(tiles, sourceDmaOps)) { + rewriter.setInsertionPoint(logicalObjectFifo); + SmallVector tileResults = {cast(tile.getResult())}; + auto newLogicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + dmaOpElem.first->replaceUsesOfWith(logicalObjectFifo.getResult(), + newLogicalObjectFifo.getResult()); + } + } else { + // For now, use first tile in sorted list. This will need to become more + // complex in the future to account for potential hardware limitations and + // constraints. + SmallVector tileResults = {cast(tiles[0].getResult())}; + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast(logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); } + return WalkResult::advance(); + }); + return success(); +} + +/// Allocate different memories for logical objectFifos on the same shared +/// memory tile to ensure different buffers will be used for them. 
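`assignAieTilesAndDistributeLogicalObjectFifos` above pairs the column-sorted candidate tiles one-to-one with the column-sorted DMA users, so each DMA ends up with its own single-tile objectfifo, and otherwise falls back to the first tile in the sorted list. A standalone sketch of that sort-then-zip assignment over plain integers and strings (names are illustrative, not the pass's API):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Candidate tile columns and the column already assigned to each DMA user.
  std::vector<int64_t> tileColumns = {2, 0, 1};
  std::vector<std::pair<std::string, int64_t>> dmaUsers = {
      {"dma_b", 1}, {"dma_c", 2}, {"dma_a", 0}};

  // Sort both sides by column so the pairing is deterministic.
  std::sort(tileColumns.begin(), tileColumns.end());
  std::sort(dmaUsers.begin(), dmaUsers.end(),
            [](const auto &a, const auto &b) { return a.second < b.second; });

  if (tileColumns.size() == dmaUsers.size()) {
    // One tile per DMA user, paired in column order.
    for (size_t i = 0; i < dmaUsers.size(); ++i)
      std::cout << dmaUsers[i].first << " -> column " << tileColumns[i] << "\n";
  } else {
    // Fallback: every DMA uses the first tile in the sorted list.
    std::cout << "all DMAs -> column " << tileColumns.front() << "\n";
  }
  return 0;
}
```

The one-to-one pairing only applies when the number of candidate tiles matches the number of DMA users, mirroring the `targetDmaOps.size() == tiles.size()` / `sourceDmaOps.size() == tiles.size()` guards in the pass.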
+LogicalResult distributeSharedMemory(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + // Map from local objectfifos found to the tiles where they are used + DenseMap>, Value, LocationMapInfo> + locationsToMemref; + + moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 1) + return WalkResult::advance(); - // Concatenate existing with new tiles and replace the logicalObjectFifo - std::move(objFifoTiles.begin(), objFifoTiles.end(), - std::back_inserter(tileResults)); + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast(tile.getDefiningOp()); + }); + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + + SmallVector targetTiles; + (void)getUserTiles(logicalObjectFifo, targetTiles); + llvm::sort(targetTiles.begin(), targetTiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + tiles.insert(tiles.end(), std::make_move_iterator(targetTiles.begin()), + std::make_move_iterator(targetTiles.end())); + + SmallVector sourceTiles; + (void)getUserTiles(logicalObjectFifo, sourceTiles); + llvm::sort(sourceTiles.begin(), sourceTiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + tiles.insert(tiles.end(), std::make_move_iterator(sourceTiles.begin()), + std::make_move_iterator(sourceTiles.end())); + LLVM_DEBUG(llvm::dbgs() << "Op: " << logicalObjectFifo + << ", number of tiles: " << tiles.size() << "\n"); + + SmallVector> locations = + llvm::map_to_vector(tiles, [](AMDAIE::TileOp tile) { + return std::make_pair( + (int64_t)getConstantIntValue(tile.getCol()).value(), + (int64_t)getConstantIntValue(tile.getRow()).value()); + }); + if (!locationsToMemref.contains(locations)) { + auto allocOp = dyn_cast( + logicalObjectFifo.getMemref().getDefiningOp()); + rewriter.setInsertionPoint(allocOp); + auto newAllocOp = + dyn_cast(rewriter.clone(*allocOp.getOperation())); + auto newDeallocOp = rewriter.create( + rewriter.getUnknownLoc(), newAllocOp); + newDeallocOp->moveBefore(&newAllocOp->getBlock()->back()); + locationsToMemref[locations] = newAllocOp.getResult(); + } + rewriter.setInsertionPoint(logicalObjectFifo); rewriter.replaceOpWithNewOp( logicalObjectFifo, cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return success(); - } -}; + locationsToMemref[locations], logicalObjectFifo.getTiles()); + return WalkResult::advance(); + }); + return success(); +} class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< @@ -488,33 +885,92 @@ class AMDAIEDistributeCoresAndObjectFifosPass void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { MLIRContext *context = &getContext(); ModuleOp moduleOp = getOperation(); + // Convert local scf.forall operations selected for parallel distribution to // nested scf.for operations. if (failed(localForallToFor(moduleOp))) { + moduleOp.emitOpError() + << "local `scf.forall` to `scf.for` conversion failed"; return signalPassFailure(); } + // Hoist the affine apply ops on scf.for induction variables to the // corresponding scf.for's body. 
   if (failed(hoistAffineApplyDependingOnFor(moduleOp))) {
+    moduleOp.emitOpError() << "`affine.apply` hoisting failed";
     return signalPassFailure();
   }
+  LLVM_DEBUG(llvm::dbgs() << "Module after localForallToFor: \n"
+                          << moduleOp << "\n");
+
+  if (failed(distributeLocalMemory(moduleOp))) {
+    moduleOp.emitOpError() << "local memory distribution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after distributeLocalMemory: \n"
+                          << moduleOp << "\n");
+
   // Unroll local parallel loops and try hoisting dma operations if
   // possible.
   RewritePatternSet unrollLocalLoopsPatterns(context);
   unrollLocalLoopsPatterns.insert<AMDAIEUnrollLocalLoops>(context);
   if (failed(applyPatternsAndFoldGreedily(
           moduleOp, std::move(unrollLocalLoopsPatterns)))) {
+    moduleOp.emitOpError()
+        << "loop unrolling of loops selected for parallel execution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after AMDAIEUnrollLocalLoops: \n"
+                          << moduleOp << "\n");
+
+  // Insert `amdaie.logicalobjectfifo.access` operations which retrieve the
+  // memrefs from logical objectfifos and update the computational operations to
+  // operate on these local memrefs. These access operations will be used to
+  // assign local AIE tiles to local logical objectFifos later.
+  if (failed(insertLogicalObjectFifoAccess(moduleOp))) {
+    moduleOp.emitOpError()
+        << "insertion of `amdaie.logicalobjectfifo.access` operations failed";
     return signalPassFailure();
   }
+  LLVM_DEBUG(llvm::dbgs() << "Module after insertLogicalObjectFifoAccess: \n"
+                          << moduleOp << "\n");
+
   // Assign tile locations to logical objectfifos on local (L1) memory.
   if (failed(assignLocalAieTiles(moduleOp))) {
+    moduleOp.emitOpError() << "local tile assignment failed";
     return signalPassFailure();
   }
-  // Assign tile locations to the remaining logical objectfifos.
+  LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n"
+                          << moduleOp << "\n");
+
+  // Assign a set of potential tile locations to the remaining logical
+  // objectFifos.
   RewritePatternSet assignAieTilePatters(context);
-  assignAieTilePatters.insert<AssignAieTiles>(context);
+  assignAieTilePatters.insert<FillAieTiles>(context);
   if (failed(applyPatternsAndFoldGreedily(moduleOp,
                                           std::move(assignAieTilePatters)))) {
+    moduleOp.emitOpError()
+        << "collection of tile candidates for logical objectFifos failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n"
+                          << moduleOp << "\n");
+
+  // Assign specific tile locations to objectFifos, starting from the set of
+  // potential tile locations filled in earlier.
+  if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) {
+    moduleOp.emitOpError()
+        << "tile assignment and logical objectFifo distribution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs()
+             << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n"
+             << moduleOp << "\n");
+
+  // Allocate different memories for logical objectFifos on the same shared
+  // memory tile to ensure different buffers will be used for them.
+ if (failed(distributeSharedMemory(moduleOp))) { + moduleOp.emitOpError() << "distribution of shared memory failed"; return signalPassFailure(); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index bb96c9f4d..64f0f39d8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -264,6 +264,12 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, auto walkResult = aieCoreOp.walk([&](Operation *op) { rewriter.setInsertionPoint(op); if (TypeSwitch(op) + .Case([&](auto accessOp) { + // TODO(jornt): Temporary until access operations are used for + // inserting synchronization stubs instead of consume/produce. + rewriter.eraseOp(accessOp); + return success(); + }) .Case([&](auto acquireOp) { return acquireOpToAIE(rewriter, acquireOp, mapper, localMemrefMapper); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir index 95e046396..3f2c9d123 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. 
There are no dma ops in this @@ -10,30 +10,24 @@ // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]]) -// CHECK: %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]]) -// CHECK: %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]]) -// CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]]) +// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]]) +// CHECK: %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]]) +// CHECK: %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]]) +// CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]]) module { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - scf.forall (%arg2, %arg3) in (1, 4) { - %tile = amdaie.tile(%arg3, %c2) - %21 = amdaie.core(%tile) { - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + scf.forall (%arg2, %arg3) in (1, 4) { + %tile = amdaie.tile(%arg3, %c2) + %21 = amdaie.core(%tile) { amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} return } @@ -48,29 +42,23 @@ module { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK: scf.forall -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]]) -// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]]) +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]]) module { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - scf.forall (%arg2, %arg3) in (2, 2) { - %tile = amdaie.tile(%arg3, %arg2) - %0 = amdaie.core(%tile) { - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %tile = amdaie.tile(%arg3, %arg2) + %0 = amdaie.core(%tile) { amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} return } @@ -86,46 +74,47 @@ module { // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> -// CHECK-DAG: 
%[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) module { func.func @unroll_dma() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %tile = amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end 
- } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile = amdaie.tile(%arg3, %c2) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -146,41 +135,39 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs module { func.func @hoist_dma_single_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %tile = 
amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile = amdaie.tile(%arg3, %c2) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -198,7 +185,7 @@ module { // be hoisted. To check this, we use `CHECK-NOT: amdaie.dma_cpy_nd` after // already encountered once. // -// CHECK-LABEL: @hoist_dma_and_affine_single_loop +// CHECK-LABEL: @hoist_dma_and_affine_single_loop_2x1 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index @@ -206,42 +193,94 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-NOT: amdaie.dma_cpy_nd -// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-NOT: amdaie.dma_cpy_nd +// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> module { - func.func @hoist_dma_and_affine_single_loop() { + func.func @hoist_dma_and_affine_single_loop_2x1() { + %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall 
(%arg2, %arg3) in (2, 1) { - %c2 = arith.constant 2 : index - %apply = affine.apply #map(%arg3) - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 1) { + %c2 = arith.constant 2 : index + %apply = affine.apply #map(%arg3) + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_1 : memref<32x64xi32, 2> + memref.dealloc %alloc : memref<32x1024xi32, 1> + return + } +} + +// ----- + +// Check for unrolling a parallel loop, with both cores and dma ops. The dma op +// does depend on one of the induction variables and can't be hoisted. However, +// in this test, the DMA operation does depend on an affine apply operation +// within the `scf.for` operation's scope and checks whether both the affine +// apply and the DMA can be unrolled correctly. +// +// CHECK-LABEL: @unroll_dma_and_affine_single_loop +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_3]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +#map = affine_map<(d0) -> (d0 * 32)> +module { + func.func @unroll_dma_and_affine_single_loop() { + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<32x1024xi32, 1> + %alloc_1 = memref.alloc() : memref<32x64xi32, 2> + scf.forall (%arg0, %arg1) in (1, 1) { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> 
!amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 1) { + %c2 = arith.constant 2 : index + %apply = affine.apply #map(%arg2) + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -263,50 +302,46 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_3]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]], %[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = 
amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) module { func.func @hoist_dma_multi_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -326,57 +361,50 @@ module { // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> -// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_3]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// 
CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) module { func.func @hoist_dma_one_of_multi_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -398,64 +426,69 @@ module { // CHECK-DAG: %[[C3:.+]] = 
arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<32x1024xi32> // CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<32x64xi32, 1> -// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 1> +// CHECK-DAG: %[[ALLOC_3:.+]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_3]], %[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_0_2]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_5]] -// CHECK-SAME: %[[FROM_MEMREF_3]] -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] -// CHECK-SAME: %[[FROM_MEMREF_2]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref 
%[[ALLOC_3]], {%[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_5]] +// CHECK-SAME: %[[FROM_MEMREF_3]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] +// CHECK-SAME: %[[FROM_MEMREF_2]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs module { func.func @hoist_dma_dependencies() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32> %alloc_1 = memref.alloc() : memref<32x64xi32, 1> %alloc_2 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%4) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %core = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%4) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : 
memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_2 : memref<32x64xi32, 2> memref.dealloc %alloc_1 : memref<32x64xi32, 1> @@ -466,68 +499,182 @@ module { // ----- -// CHECK-LABEL: @distribute_cores_and_objectfifos +// Check dependencies of DMAs on preceding DMAs at different loop levels. +// +// CHECK-LABEL: @nested_dma_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<32x1024xi32> +// CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<32x128xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_3:.+]] = memref.alloc() : memref<32x32xi32, 2> +// CHECK-DAG: %[[ALLOC_4:.+]] = memref.alloc() : memref<2x2x32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_5:.+]] = memref.alloc() : memref<64x64xi32> +// CHECK: scf.forall (%{{.+}}, %[[ARG1:.+]]) in (2, 2) +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_6:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_3]]} +// CHECK-DAG: %[[FROM_MEMREF_7:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]] +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = 
amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]][] [] [], %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_6]] +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]] +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] +module { + func.func @nested_dma_dependencies() { + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<32x1024xi32> + %alloc_1 = memref.alloc() : memref<32x128xi32, 1> + %alloc_2 = memref.alloc() : memref<32x64xi32, 2> + %alloc_3 = memref.alloc() : memref<32x32xi32, 2> + %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1> + %alloc_5 = memref.alloc() : memref<64x64xi32> + scf.forall (%arg0, %arg1) in (2, 2) { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x128xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<32x32xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<64x64xi32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg1] [%c1] [%c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg2, %arg3) in (2, 2) { + %7 = amdaie.dma_cpy_nd(%2[] [] [], %1[%arg2] [%c1] [%c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%4[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %core = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%7) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<32x32xi32, 2>) + amdaie.logicalobjectfifo.produce(%8) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} + %9 = amdaie.dma_cpy_nd(%5[%arg1] 
[%c1] [%c1], %4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_5 : memref<64x64xi32> + memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1> + memref.dealloc %alloc_3 : memref<32x32xi32, 2> + memref.dealloc %alloc_2 : memref<32x64xi32, 2> + memref.dealloc %alloc_1 : memref<32x128xi32, 1> + memref.dealloc %alloc : memref<32x1024xi32> + return + } +} + +// ----- + +// CHECK-LABEL: @distribute_cores_and_objectfifos // CHECK-DAG: %[[IN_B:.*]] = hal.interface.binding.subspan set(0) binding(1) // CHECK-DAG: %[[IN_A:.*]] = hal.interface.binding.subspan set(0) binding(0) // CHECK-DAG: %[[OUTPUT:.*]] = hal.interface.binding.subspan set(0) binding(2) // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<4x8x8x8xi32, 2> // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<8x8x4x8xi32, 2> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<4x8x4x8xi32, 2> -// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 1> // CHECK-DAG: %[[ALLOC_3:.*]] = memref.alloc() : memref<64x32xi32, 1> -// CHECK-DAG: %[[ALLOC_4:.*]] = memref.alloc() : memref<32x64xi32, 1> -// CHECK-DAG: scf.forall -// CHECK-SAME: in (1, 1) -// CHECK-DAG: amdaie.workgroup { -// CHECK-DAG: %[[TILE:.*]] = amdaie.tile(%c1, %c2) -// CHECK-DAG: %[[TILE_5:.*]] = amdaie.tile(%c0, %c2) -// CHECK-DAG: %[[TILE_6:.*]] = amdaie.tile(%c0, %c1) -// CHECK-DAG: %[[TILE_7:.*]] = amdaie.tile(%c1, %c1) -// CHECK-DAG: %[[TILE_8:.*]] = amdaie.tile(%c1, %c0) -// CHECK-DAG: %[[TILE_9:.*]] = amdaie.tile(%c0, %c0) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_7]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_7]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE]]} -// CHECK-DAG: %[[FROM_MEMREF_6:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_7:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE]], %[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE]]} -// CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_8]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_9]]} -// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_9]]} -// CHECK-DAG: %[[FROM_MEMREF_13:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_8]]} -// CHECK-DAG: %[[FROM_MEMREF_14:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_9]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] -// CHECK-SAME: %[[FROM_MEMREF_12]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_7]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_14]] -// CHECK-DAG: %[[DMA_3:.*]] = 
amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]] -// CHECK-SAME: %[[FROM_MEMREF_2]] -// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] -// CHECK-SAME: %[[FROM_MEMREF_6]] -// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_11]] -// CHECK-SAME: %[[FROM_MEMREF_4]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_5]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_4]]) -// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_13]] -// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_8:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_5]] -// CHECK-DAG: %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] -// CHECK-SAME: %[[FROM_MEMREF_3]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_7]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_8]]) +// CHECK-DAG: %[[ALLOC_4:.*]] = memref.alloc() : memref<64x32xi32, 1> +// CHECK-DAG: %[[ALLOC_5:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_6:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: scf.forall (%{{.+}}, %{{.+}}) in (1, 1) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%c1, %c2) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%c0, %c2) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%c0, %c1) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%c1, %c1) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%c1, %c0) +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%c0, %c0) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_6]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_6:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_7:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_13:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_14:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] +// CHECK-SAME: %[[FROM_MEMREF_12]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_7]] +// CHECK-SAME: 
%[[FROM_MEMREF_0]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_14]] +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]] +// CHECK-SAME: %[[FROM_MEMREF_2]] +// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] +// CHECK-SAME: %[[FROM_MEMREF_6]] +// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_11]] +// CHECK-SAME: %[[FROM_MEMREF_4]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_9]], Read) +// CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_4]]) +// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_13]] +// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_8:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_5]] +// CHECK-DAG: %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] +// CHECK-SAME: %[[FROM_MEMREF_3]] +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_8]], Read) +// CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_7]]) +// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_8]]) #map = affine_map<(d0) -> (d0 * 32)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> @@ -558,44 +705,39 @@ module { %alloc_3 = memref.alloc() : memref<64x32xi32, 1> %alloc_4 = memref.alloc() : memref<32x64xi32, 1> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> - %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<64x32xi32, 1> -> !amdaie.logicalobjectfifo> - %5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<4x8x8x8xi32, 2> -> !amdaie.logicalobjectfifo> - %9 = amdaie.logicalobjectfifo.from_memref %2, {} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.logicalobjectfifo.from_memref %1, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %0, {} : memref<1024x64xi32> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map(%arg3) - %14 = amdaie.dma_cpy_nd(%3[] [] [], %10[%12, %c960] [%c32, %c64] [%c1024, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %15 = amdaie.dma_cpy_nd(%4[] [] [], %11[%c960, %13] [%c64, %c32] [%c64, %c1]) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) - %16 = amdaie.dma_cpy_nd(%7[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %3[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %17 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %4[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c8, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %18 = amdaie.dma_cpy_nd(%5[%c0, %c0] [%c32, %c32] [%c32, %c1], %6[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %20 = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %20) - %21 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%16) - amdaie.logicalobjectfifo.consume(%17) - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %22 = arith.muli %in, %in_5 : i32 - %23 = arith.addi %out, %22 : i32 - linalg.yield %23 : i32 - } - amdaie.logicalobjectfifo.produce(%18) - amdaie.end + %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<64x32xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<4x8x8x8xi32, 2> -> !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.from_memref %2, {} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.logicalobjectfifo.from_memref %1, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %11 = amdaie.logicalobjectfifo.from_memref %0, {} : memref<1024x64xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map(%arg3) + %14 = amdaie.dma_cpy_nd(%3[] [] [], %10[%12, %c960] [%c32, %c64] [%c1024, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.dma_cpy_nd(%4[] [] [], %11[%c960, %13] [%c64, %c32] [%c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %16 = amdaie.dma_cpy_nd(%7[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %3[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %4[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c8, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %18 = amdaie.dma_cpy_nd(%5[%c0, %c0] [%c32, %c32] [%c32, %c1], %6[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) + %20 = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %20) + %21 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%16) + amdaie.logicalobjectfifo.consume(%17) + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) { + ^bb0(%in: i32, %in_5: i32, %out: i32): + %22 = arith.muli %in, %in_5 : i32 + %23 = arith.addi %out, %22 : i32 + linalg.yield %23 : i32 } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + amdaie.logicalobjectfifo.produce(%18) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_4 : memref<32x64xi32, 1> memref.dealloc %alloc_3 : memref<64x32xi32, 1> diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index d522dd0d9..d721cef5f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -326,13 +326,19 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch( xrt::run run = xrt::run(kernel); + // set opcode for transaction binary execution + unsigned int opcode = 3; + // Index to push arguments on the kernel. iree_host_size_t arg_index = 0; - // First argument is the LX6 instructions. + // First argument is the opcode. + run.set_arg(arg_index++, opcode); + + // Second argument is the LX6 instructions. run.set_arg(arg_index++, instr); - // Second argument is the number of LX6 instructions. + // Third argument is the number of LX6 instructions. run.set_arg(arg_index++, num_instr); // Copy descriptors from all sets to the end of the current segment for later diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index f7d75be6a..d572dbe4c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -209,11 +209,11 @@ iree_status_t iree_hal_xrt_native_executable_create( std::make_unique(contexts[xclbin_index], entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. This buffer is always passed as - // the first argument to the kernel and we can use the - // kernel.group_id(/*index of first argument*/=0) to get the group_id. + // the second argument to the kernel and we can use the + // kernel.group_id(/*index of second argument*/=1) to get the group_id. instr = std::make_unique(device, num_instr * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, - kernel.get()->group_id(0)); + kernel.get()->group_id(1)); } catch (...) { iree_hal_executable_destroy((iree_hal_executable_t*)executable); IREE_TRACE_ZONE_END(z0); diff --git a/third_party/mlir-aie b/third_party/mlir-aie index 6f70bfe49..3ac9566f1 160000 --- a/third_party/mlir-aie +++ b/third_party/mlir-aie @@ -1 +1 @@ -Subproject commit 6f70bfe4904ec719042d7ebd8ded9ad8b31bb5b6 +Subproject commit 3ac9566f1da7c4ee6e81c263bc15d92aba7bcae7 diff --git a/third_party/mlir-air b/third_party/mlir-air index 766f50c77..b2df4d74a 160000 --- a/third_party/mlir-air +++ b/third_party/mlir-air @@ -1 +1 @@ -Subproject commit 766f50c7768dc9a12bb933f9ed45014a889d106e +Subproject commit b2df4d74a77e6d7be327e75802098eb96b5c9a35
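For reference, the XRT driver hunks above (direct_command_buffer.cc and native_executable.cc) reorder the kernel arguments: a transaction-binary opcode (3) is now passed first, the LX6 instruction buffer second (hence the switch to kernel.group_id(1) when allocating it), and the instruction count third. The listing below is a minimal host-side sketch of that argument layout, not code from this patch: the xclbin path ("design.xclbin"), the kernel name ("MLIR_AIE"), and the placeholder instruction words are assumptions made purely for illustration.

#include <cstdint>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

int main() {
  // Open the device and load a compiled design; both names are placeholders.
  xrt::device device(0);
  auto uuid = device.load_xclbin("design.xclbin");
  xrt::kernel kernel(device, uuid, "MLIR_AIE");

  // LX6 instruction words; in the real driver these come from the executable.
  std::vector<uint32_t> instructions = {0u, 0u, 0u};

  // The instruction buffer is now the second kernel argument, so its memory
  // bank is looked up with group_id(1) rather than group_id(0).
  xrt::bo instr(device, instructions.size() * sizeof(uint32_t),
                xrt::bo::flags::cacheable, kernel.group_id(1));
  instr.write(instructions.data());
  instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);

  // Argument order used by the updated dispatch path:
  //   0: opcode (3 selects transaction-binary execution)
  //   1: LX6 instruction buffer
  //   2: number of LX6 instructions
  xrt::run run(kernel);
  unsigned int arg_index = 0;
  unsigned int opcode = 3;
  run.set_arg(arg_index++, opcode);
  run.set_arg(arg_index++, instr);
  run.set_arg(arg_index++, static_cast<uint32_t>(instructions.size()));
  // Buffer bindings for the dispatch's descriptor sets would be appended here.
  run.start();
  run.wait();
  return 0;
}

Passing the opcode as the leading argument lets the same kernel entry point multiplex different command types; every existing argument index shifts by one, which is why native_executable.cc now queries group_id(1) for the cacheable instruction buffer.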