diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index c7f6b2cb2..4e9bbe923 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,7 +103,7 @@ jobs: run: | python3 -m venv .venv source .venv/bin/activate - pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024060222+6f70bfe-py3-none-manylinux_2_35_x86_64.whl + pip install https://github.com/Xilinx/mlir-aie/releases/download/latest-wheels/mlir_aie-0.0.1.2024061222+3ac9566-py3-none-manylinux_2_35_x86_64.whl pip install -r tests/matmul/requirements.txt diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 0765348de..98d451c4a 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -638,7 +638,7 @@ run_matmul_test \ --lhs_rhs_type "bf16" \ --acc_type "f32" \ --m "64" --n "64" --k "128" \ - --expect_compile_failure "1" + --num_repeat_runs "0" run_matmul_test \ --name_prefix "packPeelLarge" \ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td index 7d9b7e70f..1ad7d9506 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEAttrs.td @@ -41,6 +41,18 @@ def LogicalObjectFifoPort: I32EnumAttr<"LogicalObjectFifoPort", let cppNamespace = "mlir::iree_compiler::AMDAIE"; } +def MemoryAccess: I32EnumAttr<"MemoryAccess", + "The memory access type", + [ + I32EnumAttrCase<"None", 0>, + I32EnumAttrCase<"Read", 1>, + I32EnumAttrCase<"Write", 2>, + I32EnumAttrCase<"Any", 3>, + ] + > { + let cppNamespace = "mlir::iree_compiler::AMDAIE"; +} + def AMDAIE_MemSpace_Global : I32EnumAttrCase<"Global", 0>; def AMDAIE_MemSpace_Shared : I32EnumAttrCase<"Shared", 1>; def AMDAIE_MemSpace_Local : I32EnumAttrCase<"Local", 2>; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp index e4c975e68..b56bbc31d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.cpp @@ -328,6 +328,22 @@ LogicalObjectFifoFromMemrefOp CircularDmaCpyNdOp::getTargetObjectFifo() { return dyn_cast(getTarget().getDefiningOp()); }; +//===----------------------------------------------------------------------===// +// AMDAIE_LogicalObjectFifoAccessOp +//===----------------------------------------------------------------------===// + +void LogicalObjectFifoAccessOp::build(OpBuilder &b, + mlir::OperationState &result, Value input, + MemoryAccess accessType) { + auto type = llvm::cast(input.getType()); + build(b, result, type.getElementType(), input, accessType); +} + +LogicalObjectFifoFromMemrefOp +LogicalObjectFifoAccessOp::getLogicalObjectFifo() { + return dyn_cast(getInput().getDefiningOp()); +}; + //===----------------------------------------------------------------------===// // AMDAIE_LogicalObjectFifoAcquire //===----------------------------------------------------------------------===// @@ -341,6 +357,26 @@ void LogicalObjectFifoAcquire::build(OpBuilder &b, mlir::OperationState &result, // AMDAIE_LogicalObjectFifoFromMemrefOp //===----------------------------------------------------------------------===// +/// Build with an array of static tile locations. 
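The builder defined next takes static `(column, row)` tile locations, materializes `amdaie.tile` ops for them and sorts them column-first, then row, for deterministic IR. A minimal standalone sketch of that ordering and de-duplication, using plain `std::pair` coordinates rather than the actual `TileOp` values (illustrative only, not part of the patch):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Column-major-then-row ordering, mirroring the intent of
// TileOp::tileValueColumnAndRowComparator added later in this patch.
static bool colRowLess(const std::pair<int64_t, int64_t> &a,
                       const std::pair<int64_t, int64_t> &b) {
  if (a.first == b.first) return a.second < b.second;
  return a.first < b.first;
}

int main() {
  // (col, row) tile coordinates in arbitrary order, with a duplicate.
  std::vector<std::pair<int64_t, int64_t>> tiles = {
      {1, 3}, {0, 2}, {1, 3}, {0, 1}};
  std::sort(tiles.begin(), tiles.end(), colRowLess);
  tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end());
  for (auto [col, row] : tiles)
    std::cout << "(" << col << ", " << row << ")\n";  // (0,1) (0,2) (1,3)
  return 0;
}
```

Sorting followed by `std::unique` is also what the updated `canonicalize` of `LogicalObjectFifoFromMemrefOp` below does on the tile operand list.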
+void LogicalObjectFifoFromMemrefOp::build( + OpBuilder &b, mlir::OperationState &result, Value memref, + ArrayRef> tileLocations) { + SmallVector tiles; + tiles.reserve(tileLocations.size()); + for (auto [column, row] : tileLocations) { + auto colIndex = b.create(b.getUnknownLoc(), column); + auto rowIndex = b.create(b.getUnknownLoc(), row); + auto tileOp = + b.create(b.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // For deterministic order. + llvm::sort(tiles.begin(), tiles.end(), + TileOp::tileValueColumnAndRowComparator); + auto type = LogicalObjectFifoType::get(cast(memref.getType())); + build(b, result, type, memref, tiles); +} + LogicalResult LogicalObjectFifoFromMemrefOp::canonicalize( LogicalObjectFifoFromMemrefOp logicalObjectFifo, PatternRewriter &rewriter) { @@ -349,23 +385,19 @@ LogicalResult LogicalObjectFifoFromMemrefOp::canonicalize( return success(); } - auto comparator = [](Value a, Value b) -> bool { - TileOp tileA = dyn_cast(a.getDefiningOp()); - TileOp tileB = dyn_cast(b.getDefiningOp()); - int64_t colA = getConstantIntValue(tileA.getCol()).value(); - int64_t rowA = getConstantIntValue(tileA.getRow()).value(); - int64_t colB = getConstantIntValue(tileB.getCol()).value(); - int64_t rowB = getConstantIntValue(tileB.getRow()).value(); - if (colA == colB) return rowA < rowB; - return colA < colB; - }; SmallVector tiles = logicalObjectFifo.getTiles(); - if (llvm::is_sorted(tiles, comparator)) { + if (llvm::is_sorted(tiles, TileOp::tileValueColumnAndRowComparator)) { + // Still erase duplicates. + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); return success(); } - // If tiles are not sorted, sort them and replace the logical objectfifo - llvm::sort(tiles.begin(), tiles.end(), comparator); + // If tiles are not sorted, sort them, erase duplicates and replace the + // logical objectfifo. 
+ llvm::sort(tiles.begin(), tiles.end(), + TileOp::tileValueColumnAndRowComparator); + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + rewriter.replaceOpWithNewOp( logicalObjectFifo, llvm::cast( @@ -532,6 +564,23 @@ bool TileOp::hasStaticLocation() { return getConstantIntValue(getCol()) && getConstantIntValue(getRow()); } +bool TileOp::tileColumnComparator(AMDAIE::TileOp &a, AMDAIE::TileOp &b) { + int64_t colA = getConstantIntValue(a.getCol()).value(); + int64_t colB = getConstantIntValue(b.getCol()).value(); + return colA < colB; +} + +bool TileOp::tileValueColumnAndRowComparator(Value a, Value b) { + TileOp tileA = dyn_cast(a.getDefiningOp()); + TileOp tileB = dyn_cast(b.getDefiningOp()); + int64_t colA = getConstantIntValue(tileA.getCol()).value(); + int64_t rowA = getConstantIntValue(tileA.getRow()).value(); + int64_t colB = getConstantIntValue(tileB.getCol()).value(); + int64_t rowB = getConstantIntValue(tileB.getRow()).value(); + if (colA == colB) return rowA < rowB; + return colA < colB; +}; + //===----------------------------------------------------------------------===// // AMDAIE_WorkgroupOp //===----------------------------------------------------------------------===// diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td index b565212a0..083b94db6 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/AMDAIEOps.td @@ -110,6 +110,11 @@ def AMDAIE_TileOp: AMDAIE_Op<"tile", [ let extraClassDeclaration = [{ bool hasStaticLocation(); + // Comparator for `amdaie.tile` based on column index. + static bool tileColumnComparator(AMDAIE::TileOp &a, AMDAIE::TileOp &b); + // Comparator for `amdaie.tile` values based on column index first and then + // row index. + static bool tileValueColumnAndRowComparator(Value a, Value b); }]; } @@ -319,6 +324,53 @@ def AMDAIE_NpuDmaWaitOp: AMDAIE_Op<"npu.dma_wait", []> { // IREE AMDAIE LogicalObjectFifo Ops //===----------------------------------------------------------------------===// +def AMDAIE_LogicalObjectFifoAccessOp : AMDAIE_Op<"logicalobjectfifo.access"> { + let summary = "Operation to access the encapsulated memref from a logical" + "objectFifo."; + let description = [{ + Returns the encapsulated memref from a logical objectFifo. This is meant to + be used within `amdaie.core` operations to access and operate on the memref. + Has a memory `access_type` argument that indicates the type of access being + done. This can be used to generate a correct (semaphore) synchronization + scheme to access the logical objectFifo's content. + + Example: + ```mlir + %tile = amdaie.tile(%c1, %c3) + %alloc = memref.alloc() : memref<8x16xi32, 2> + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {%tile} : memref<8x16xi32, 2> + -> !amdaie.logicalobjectfifo> + %core = amdaie.core(%tile) { + %1 = amdaie.logicalobjectfifo.access(%0, Read) : + !amdaie.logicalobjectfifo> -> memref<8x16xi32, 2> + ``` + }]; + + let arguments = ( + ins AnyAMDAIELogicalObjectFifoType:$input, + MemoryAccess:$access_type + ); + + let results = (outs AnyMemRef:$output); + + let assemblyFormat = [{ + `(` $input `,` $access_type `)` attr-dict `:` type($input) `->` type($output) + }]; + + let builders = [ + // Build a LogicalObjectFifoAccessOp with a logicalObjectFifo value and access + // type. 
+ OpBuilder<(ins "mlir::Value":$input, "MemoryAccess":$access_type)> + ]; + + let extraClassDeclaration = [{ + LogicalObjectFifoFromMemrefOp getLogicalObjectFifo(); + }]; + + // let hasVerifier = 1; + let cppNamespace = "mlir::iree_compiler::AMDAIE"; +} + def AMDAIE_LogicalObjectFifoAcquire: AMDAIE_Op<"logicalobjectfifo.acquire", []> { let summary = "Semaphore operation to acquire objects from a logical" @@ -430,7 +482,12 @@ def AMDAIE_LogicalObjectFifoFromMemrefOp // Build a LogicalObjectFifoFromMemrefOp with just a memref value. let builders = [ - OpBuilder<(ins "mlir::Value":$memref)> + OpBuilder<(ins "mlir::Value":$memref)>, + // Build `LogicalObjectFifoFromMemrefOp` with an array of static tile + // locations. + OpBuilder< + (ins "mlir::Value":$memref, + "::llvm::ArrayRef>":$tileLocations)> ]; let extraClassDeclaration = [{ diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir index bedaf8c08..670e303a2 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/IR/test/roundtrip.mlir @@ -116,6 +116,15 @@ func.func @dma_cpy_nd_mixed(%arg0: !amdaie.logicalobjectfifo>) { + %0 = amdaie.logicalobjectfifo.access(%arg0, Write) : !amdaie.logicalobjectfifo> -> memref<1x1x8x16xi32, 2 : i32> + return +} + +// ----- + // CHECK-LABEL: func.func @logicalobjectfifo_acquire // CHECK: %[[DMA:.+]] = amdaie.dma_cpy_nd // CHECK: amdaie.logicalobjectfifo.acquire diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp index 170e9ea30..84c0da3e8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDistributeCoresAndObjectFifos.cpp @@ -9,6 +9,7 @@ #include "iree-amd-aie/Transforms/Passes.h" #include "iree-amd-aie/Transforms/Transforms.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "mlir/Dialect/Linalg/IR/Linalg.h" #include "mlir/Dialect/SCF/Transforms/Transforms.h" #include "mlir/IR/Iterators.h" #include "mlir/Pass/Pass.h" @@ -24,6 +25,144 @@ static const llvm::StringLiteral kAMDAIELoopUnroll = "amdaie.unroll"; namespace { +//===----------------------------------------------------------------------===// +// Utilities +//===----------------------------------------------------------------------===// + +/// Comparator for a pair of `amdaie.dma_cpy_nd` on the first tile operation's +/// column index. +bool dmaColComparator( + std::pair> &a, + std::pair> &b) { + return TileOp::tileColumnComparator(a.second[0], b.second[0]); +}; + +/// Utility to use tuple coordinates as key of a `DenseMap`. 
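The `LocationMapInfo` struct that follows supplies the empty key, tombstone key, hash and equality that `DenseMap` needs in order to key on a list of `(column, row)` tuples. For intuition, here is a roughly equivalent standalone sketch using `std::unordered_map` with a custom hash; the buffer names are made up for illustration and this is not the patch's API:

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using Locations = std::vector<std::pair<int64_t, int64_t>>;

// Combine the hashes of all (col, row) pairs, similar in spirit to the
// llvm::hash_combine_range call in LocationMapInfo::getHashValue.
struct LocationsHash {
  size_t operator()(const Locations &locs) const {
    size_t seed = 0;
    for (auto [col, row] : locs) {
      seed ^= std::hash<int64_t>()(col) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
      seed ^= std::hash<int64_t>()(row) + 0x9e3779b9 + (seed << 6) + (seed >> 2);
    }
    return seed;
  }
};

int main() {
  // Map a set of tile locations to a (hypothetical) buffer name, the same
  // role locationsToMemref plays in distributeSharedMemory further down.
  std::unordered_map<Locations, std::string, LocationsHash> locationsToBuffer;
  locationsToBuffer[{{0, 1}}] = "buffer_a";
  locationsToBuffer[{{0, 1}, {1, 1}}] = "buffer_b";
  std::cout << locationsToBuffer[{{0, 1}}] << "\n";  // buffer_a
  return 0;
}
```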
+struct LocationMapInfo { + static SmallVector> getEmptyKey() { + return {std::make_pair(int64_t(-1), int64_t(-1))}; + } + + static SmallVector> getTombstoneKey() { + return {std::make_pair(int64_t(-2), int64_t(-2))}; + } + + static unsigned getHashValue( + const SmallVector> &v) { + return static_cast(llvm::hash_combine_range(v.begin(), v.end())); + } + + static bool isEqual(const SmallVector> &lhs, + const SmallVector> &rhs) { + return lhs == rhs; + } +}; + +//===----------------------------------------------------------------------===// +// AMDAIEDistributeCoresAndObjectFifosPass +//===----------------------------------------------------------------------===// + +/// Distribute local memory accesses through subviews by allocating a single +/// smaller memory. This is needed because cores can't operate on one larger L1 +/// memory. +LogicalResult distributeLocalMemory(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + SmallVector toBeErased; + // Map from alloc operations to a new alloc operations to be used. + DenseMap memrefToNew; + + moduleOp->walk([&](memref::AllocOp allocOp) { + // Only consider local memory (L1). + Attribute memSpace = + cast(allocOp.getResult().getType()).getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + LLVM_DEBUG(llvm::dbgs() + << "DistributeLocalMemory for: " << allocOp << "\n"); + + SmallVector dmaUsers; + for (Operation *userOp : allocOp->getUsers()) { + if (auto logicalObjectFifo = + dyn_cast(userOp)) { + for (Operation *objFifoUserOp : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(objFifoUserOp); + dmaOp.getSourceObjectFifo() == logicalObjectFifo) { + dmaUsers.push_back(dmaOp); + } + } + } + } + if (dmaUsers.empty()) return WalkResult::advance(); + LLVM_DEBUG(llvm::dbgs() << "DMA users: " << dmaUsers.size() << "\n"); + + for (Operation *userOp : allocOp->getUsers()) { + auto subviewOp = dyn_cast(userOp); + if (!subviewOp) continue; + + if (!memrefToNew.contains(allocOp)) { + LLVM_DEBUG(llvm::dbgs() << "Create new allocate\n"); + rewriter.setInsertionPoint(allocOp); + auto memRefType = cast(subviewOp.getResult().getType()); + MemRefType allocType = MemRefType::get( + memRefType.getShape(), memRefType.getElementType(), + MemRefLayoutAttrInterface{}, memRefType.getMemorySpace()); + auto newAllocOp = rewriter.create( + rewriter.getUnknownLoc(), allocType); + auto newDeallocOp = rewriter.create( + rewriter.getUnknownLoc(), newAllocOp); + newDeallocOp->moveBefore(&newAllocOp->getBlock()->back()); + memrefToNew[allocOp] = newAllocOp; + } + auto newAlloc = memrefToNew[allocOp]; + rewriter.replaceAllUsesWith(subviewOp, newAlloc); + toBeErased.push_back(subviewOp); + } + + // Update the alloc's DMA users. 
+ if (memrefToNew.contains(allocOp)) { + LLVM_DEBUG(llvm::dbgs() + << "Update allocate DMA users: " << dmaUsers.size() << "\n"); + auto newAlloc = memrefToNew[allocOp]; + auto type = cast(newAlloc.getType()); + for (AMDAIE::DmaCpyNdOp dmaOp : dmaUsers) { + SmallVector empty; + rewriter.setInsertionPoint(dmaOp.getSourceObjectFifo()); + auto source = rewriter.create( + rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), + newAlloc.getResult()); + rewriter.replaceOp(dmaOp.getSourceObjectFifo(), source); + rewriter.setInsertionPoint(dmaOp); + auto newDmaOp = rewriter.create( + dmaOp.getLoc(), dmaOp.getTarget(), dmaOp.getTargetOffsets(), + dmaOp.getTargetSizes(), dmaOp.getTargetStrides(), source, + dmaOp.getSourceOffsets(), dmaOp.getSourceSizes(), + dmaOp.getSourceStrides()); + rewriter.replaceOp(dmaOp, newDmaOp); + } + + // Insert dealloc + memref::DeallocOp deallocOp; + for (Operation *userOp : allocOp->getUsers()) { + if (auto deallocUser = dyn_cast(userOp)) { + deallocOp = deallocUser; + } + } + if (deallocOp) { + toBeErased.push_back(deallocOp); + } + toBeErased.push_back(allocOp); + } + return WalkResult::advance(); + }); + + for (auto *op : toBeErased) { + op->dropAllUses(); + rewriter.eraseOp(op); + } + return success(); +} + /// Convert inner scf.forall ops chosen for parallel distribution to scf.for /// ops. LogicalResult localForallToFor(ModuleOp moduleOp) { @@ -192,39 +331,44 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { rewriter.create(rewriter.getUnknownLoc(), 1)); // Iterate through the loop and create body - IRMapping operandMap; for (auto i = lbInt + stepInt; i < ubInt; i += stepInt) { + IRMapping operandMap; Value ivUnroll = builder.create(builder.getUnknownLoc(), i); if (!forOpIV.use_empty()) { operandMap.map(forOpIV, ivUnroll); } - // Iterate through body and clone ops + // Iterate through body and map internal logical objectfifos to new ones + // and fill operand map. for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); it++) { if (auto dmaOp = dyn_cast(*it)) { AMDAIE::LogicalObjectFifoFromMemrefOp source = dmaOp.getSourceObjectFifo(); + uint64_t sourceMemSpaceInt = source.getMemorySpaceAsUInt(); AMDAIE::LogicalObjectFifoFromMemrefOp target = dmaOp.getTargetObjectFifo(); - if (!operandMap.contains(source.getOutput())) { - rewriter.setInsertionPoint(source); - auto cloneOp = dyn_cast( - rewriter.clone(*dmaOp.getSource().getDefiningOp())); - operandMap.map(source.getOutput(), cloneOp.getOutput()); - } - if (!operandMap.contains(target.getOutput())) { + uint64_t targetMemSpaceInt = target.getMemorySpaceAsUInt(); + if (targetMemSpaceInt > sourceMemSpaceInt) { rewriter.setInsertionPoint(target); auto cloneOp = dyn_cast( rewriter.clone(*dmaOp.getTarget().getDefiningOp())); operandMap.map(target.getOutput(), cloneOp.getOutput()); + } else if (sourceMemSpaceInt > targetMemSpaceInt) { + rewriter.setInsertionPoint(source); + auto cloneOp = dyn_cast( + rewriter.clone(*dmaOp.getSource().getDefiningOp())); + operandMap.map(source.getOutput(), cloneOp.getOutput()); } - builder.clone(*it, operandMap); - } else { - builder.clone(*it, operandMap); } } + + // Iterate through body and clone ops + for (auto it = loopBodyBlock->begin(); it != std::next(srcBlockEnd); + it++) { + builder.clone(*it, operandMap); + } } return success(); } @@ -265,65 +409,6 @@ class AMDAIEUnrollLocalLoops : public OpRewritePattern { } }; -/// Assign tiles to the logical objectfifos with local memory space (L1). 
-/// The tiles are derived from the usage of the logical objectfifos within -/// core operations, which are already assigned a tile location. -LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { - IRRewriter rewriter(moduleOp.getContext()); - - // Map from local objectfifos found to the tiles where they are used - DenseMap> - logicalObjectFifosToTiles; - - // Utility function insert a local objectfifo - tile pair into the local - // objectfifo to tile map - auto insertTile = [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, - Value tileResult) -> void { - if (!logicalObjectFifosToTiles.contains(logicalObjectFifo)) { - logicalObjectFifosToTiles[logicalObjectFifo] = {}; - } - logicalObjectFifosToTiles[logicalObjectFifo].insert(tileResult); - }; - - // Walk DMA ops and find the ones which are used in cores to update - // source/target logical objectfifos - moduleOp->walk([&](AMDAIE::DmaCpyNdOp dmaOp) { - for (Operation *userOp : dmaOp->getUsers()) { - if (auto coreOp = userOp->getParentOfType()) { - Attribute sourceMemspace = dmaOp.getSourceObjectFifo().getMemorySpace(); - Attribute targetMemspace = dmaOp.getTargetObjectFifo().getMemorySpace(); - if (sourceMemspace && - dyn_cast(sourceMemspace).getInt() == 2) { - // Source on L1 - insertTile(dmaOp.getSourceObjectFifo(), - coreOp.getTileOp().getResult()); - } else if (targetMemspace && - dyn_cast(targetMemspace).getInt() == 2) { - // Target on L1 - insertTile(dmaOp.getTargetObjectFifo(), - coreOp.getTileOp().getResult()); - } - - // Move tile to beginning of parent block. - rewriter.moveOpBefore(coreOp.getTileOp(), coreOp->getBlock(), - coreOp->getBlock()->begin()); - } - } - return WalkResult::advance(); - }); - - // Update the logical objectfifos with assigned tiles - for (auto &&[logicalObjectFifo, tiles] : logicalObjectFifosToTiles) { - rewriter.setInsertionPoint(logicalObjectFifo); - rewriter.replaceOpWithNewOp( - logicalObjectFifo, - cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tiles.takeVector()); - } - return success(); -} - /// Return the tiles of the sources respectively targets of the users of this /// logical objectfifo, depending on whether the OperateOn template parameter is /// set to `OperateOn::Source` respectively `OperateOn::Target`. @@ -353,19 +438,172 @@ LogicalResult getUserTiles( return success(); } -/// Assign logical objectfifos to physical AIE tiles. This rewrite takes an -/// iterative approach by matching logical objectfifos and only assigning tiles -/// when linked through dma ops with other logical objectfifos which already -/// have tiles assigned. If the linked logical objectfifos don't have tiles -/// assigned yet, we will return a failure and give the linked logical -/// objectfifos a chance to assign tiles before returning to this one. +/// Insert `amdaie.logicalobjectfifo.access` operations which retrieve the +/// memrefs from logical objectfifos and update the computational operations to +/// operate on these local memrefs. +LogicalResult insertLogicalObjectFifoAccess(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + SmallVector coreOps; + moduleOp->walk([&](AMDAIE::CoreOp coreOp) { coreOps.push_back(coreOp); }); + + for (AMDAIE::CoreOp coreOp : coreOps) { + DenseMap> + memrefToLogicalObjectFifo; + // First walk to collect consume/produce DMA accesses and map respective + // memrefs to logical objectifos. + coreOp->walk([&](Operation *op) { + // TODO(jornt): can we avoid produce/consume? 
+ if (auto consumeOp = dyn_cast(op)) { + Value targetMemref = + consumeOp.getDmaCpyNdOp().getTargetObjectFifo().getMemref(); + memrefToLogicalObjectFifo[targetMemref] = + std::make_pair(consumeOp.getDmaCpyNdOp().getTargetObjectFifo(), + AMDAIE::MemoryAccess::Read); + } else if (auto produceOp = + dyn_cast(op)) { + Value sourceMemref = + produceOp.getDmaCpyNdOp().getSourceObjectFifo().getMemref(); + memrefToLogicalObjectFifo[sourceMemref] = + std::make_pair(produceOp.getDmaCpyNdOp().getSourceObjectFifo(), + AMDAIE::MemoryAccess::Write); + } + }); + + WalkResult res = coreOp->walk([&](Operation *op) { + if (auto linalgOp = dyn_cast(op)) { + for (auto &&[idx, operand] : + llvm::enumerate(linalgOp->getOpOperands())) { + if (memrefToLogicalObjectFifo.contains(operand.get())) { + rewriter.setInsertionPointToStart(coreOp.getBody()); + std::tuple + value = memrefToLogicalObjectFifo[operand.get()]; + rewriter.create( + rewriter.getUnknownLoc(), std::get<0>(value), + std::get<1>(value)); + // TODO(jornt): Temporary, enable after access operations are used + // for inserting synchronization stubs instead of consume/produce. + // linalgOp->setOperand(idx, accessOp); + } else if (auto type = + llvm::dyn_cast(operand.get().getType())) { + Value memref = operand.get(); + rewriter.setInsertionPoint(coreOp); + auto logicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), LogicalObjectFifoType::get(type), + memref); + rewriter.setInsertionPointToStart(coreOp.getBody()); + rewriter.create( + rewriter.getUnknownLoc(), logicalObjectFifo, + AMDAIE::MemoryAccess::None); + // TODO(jornt): Temporary, enable after access operations are used + // for inserting synchronization stubs instead of consume/produce. + // linalgOp->setOperand(idx, accessOp); + } + } + } + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + } + return success(); +} + +/// Utility to recursively find users of the provided logical objectFifo inside +/// `amdaie.core` operations and return the tile coordinates. +LogicalResult findUsersInCoreAndAddTiles( + Operation *op, AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, + llvm::SmallSetVector, 16> &tiles) { + for (Operation *userOp : op->getUsers()) { + if (auto coreOp = userOp->getParentOfType()) { + AMDAIE::TileOp tileOp = coreOp.getTileOp(); + std::optional column = getConstantIntValue(tileOp.getCol()); + std::optional row = getConstantIntValue(tileOp.getRow()); + if (!column || !row) { + return coreOp.emitOpError() << "has non-constant tile location"; + } + tiles.insert(std::make_pair(column.value(), row.value())); + } + if (auto subviewOp = dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, tiles); + } else if (auto userLogicalObjectFifo = + dyn_cast(userOp)) { + return findUsersInCoreAndAddTiles(userLogicalObjectFifo, + logicalObjectFifo, tiles); + } + } + return success(); +} + +/// Assign tiles to the logical objectfifos with local memory space (L1). +/// The tiles are derived from the usage of the logical objectfifos within +/// core operations, which are already assigned a tile location. 
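The `insertLogicalObjectFifoAccess` walk above records, per memref used inside a core, which logical objectfifo backs it and whether the core consumes (reads) or produces (writes) it, and then materializes `amdaie.logicalobjectfifo.access` ops with that access type. A standalone sketch of just that classification bookkeeping, with hypothetical SSA names and a plain `std::map` standing in for the IR types (illustrative only):

```cpp
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Mirrors the MemoryAccess enum added in AMDAIEAttrs.td.
enum class MemoryAccess { None = 0, Read = 1, Write = 2, Any = 3 };

int main() {
  // memref name -> (backing objectfifo name, access type seen in the core).
  std::map<std::string, std::pair<std::string, MemoryAccess>> memrefToFifo;

  // A "consume" means the core reads data a DMA copied into the target fifo.
  memrefToFifo["%alloc_in"] = {"%fifo_in", MemoryAccess::Read};
  // A "produce" means the core writes data a DMA will copy out of the source.
  memrefToFifo["%alloc_out"] = {"%fifo_out", MemoryAccess::Write};

  for (const auto &[memref, info] : memrefToFifo)
    std::cout << memref << " -> access(" << info.first << ", "
              << (info.second == MemoryAccess::Read ? "Read" : "Write")
              << ")\n";
  return 0;
}
```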
+LogicalResult assignLocalAieTiles(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + WalkResult res = moduleOp->walk( + [&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 2) + return WalkResult::advance(); + + llvm::SmallSetVector, 16> tileLocations; + if (failed(findUsersInCoreAndAddTiles( + logicalObjectFifo, logicalObjectFifo, tileLocations))) { + return WalkResult::interrupt(); + } + // Handle subviews. + for (Operation *userOp : + logicalObjectFifo.getMemref().getDefiningOp()->getUsers()) { + if (auto subviewOp = dyn_cast(userOp)) { + if (failed(findUsersInCoreAndAddTiles(subviewOp, logicalObjectFifo, + tileLocations))) { + return WalkResult::interrupt(); + } + } + } + + SmallVector tiles; + tiles.reserve(tileLocations.size()); + rewriter.setInsertionPoint(logicalObjectFifo); + for (auto [column, row] : tileLocations) { + auto colIndex = rewriter.create( + rewriter.getUnknownLoc(), column); + auto rowIndex = rewriter.create( + rewriter.getUnknownLoc(), row); + auto tileOp = rewriter.create( + rewriter.getUnknownLoc(), colIndex, rowIndex); + tiles.push_back(tileOp.getResult()); + } + // Sort for deterministic output IR. + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tiles); + return WalkResult::advance(); + }); + if (res.wasInterrupted()) return failure(); + return success(); +} + +/// Assign a set of potential physical AIE tiles to logical objectFifos. This +/// rewrite takes an iterative approach by matching logical objectfifos and only +/// assigning tiles when linked through dma ops with other logical objectfifos +/// which already have tiles assigned. If the linked logical objectfifos don't +/// have tiles assigned yet, we will return a failure and give the linked +/// logical objectfifos a chance to assign tiles before returning to this one. /// -/// TODO(jornt): There are decisions being made in this pass on which tile to +/// TODO(jornt): There are decisions being made in this pass on which tiles to /// assign to a logical objectfifo. This logic is very simple for now and tries -/// to use the leftmost available column. At some point, we probably need some -/// AIE device model to guide the assignement here for performance and to avoid -/// resource issues down below. -class AssignAieTiles +/// to use the tiles in the same columns as targets and sources. At some point, +/// we probably need some AIE device model to guide the assignement here for +/// performance and to avoid hardware resource issues later on. +class FillAieTiles : public OpRewritePattern { using OpRewritePattern< AMDAIE::LogicalObjectFifoFromMemrefOp>::OpRewritePattern; @@ -373,6 +611,7 @@ class AssignAieTiles LogicalResult matchAndRewrite( AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo, PatternRewriter &rewriter) const override { + LLVM_DEBUG(llvm::dbgs() << "FillAieTiles: " << logicalObjectFifo << "\n"); if (!logicalObjectFifo.getTiles().empty()) { return failure(); } @@ -388,88 +627,246 @@ class AssignAieTiles } return failure(); } + // HandLe both L3/shim and L2/Memtiles. + // Skip logical objectfifos within non-global and non-shared memory. 
+ if (memSpace && dyn_cast(memSpace).getInt() != 1) { + return logicalObjectFifo.emitOpError() + << "found logical objectfifo with unknown memory space"; + } + + SmallVector targetTiles; + SmallVector sourceTiles; + LogicalResult dstRes = + getUserTiles(logicalObjectFifo, targetTiles); + LogicalResult srcRes = + getUserTiles(logicalObjectFifo, sourceTiles); + + // If no source and target tiles found, skip. + if (failed(dstRes) && failed(srcRes)) { + return failure(); + } - SmallVector tileResults; - if (!memSpace || dyn_cast(memSpace).getInt() == 1) { - // HandLe both L3/shim and L2/Memtiles. Try to use memtiles in the same - // column as the AIE tiles where the data needs to go to. - - // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie - // target model for this. - int rowInt = memSpace ? 1 : 0; - Value row = rewriter.create( - rewriter.getUnknownLoc(), rowInt); - - SmallVector targetTiles; - SmallVector sourceTiles; - LogicalResult dstRes = - getUserTiles(logicalObjectFifo, targetTiles); - LogicalResult srcRes = - getUserTiles(logicalObjectFifo, sourceTiles); - - // If no source and target tiles found, skip. - if (failed(dstRes) && failed(srcRes)) { - return failure(); + // TODO(jornt): avoid row hardcoding. Will need to update the mlir-aie + // target model for this. + int64_t rowInt = memSpace ? 1 : 0; + llvm::SmallSetVector, 16> tileLocations; + auto createTileLocations = + [&](SmallVector &tiles) -> LogicalResult { + // TODO(jornt): For now, for deterministic behaviour, sort on column + // index and use first one. This needs to be generalized to assign + // tiles based on a resource model. + std::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + // Erase duplicates. + tiles.erase(std::unique(tiles.begin(), tiles.end()), tiles.end()); + for (AMDAIE::TileOp tile : tiles) { + std::optional column = getConstantIntValue(tile.getCol()); + if (!column) return tile.emitOpError() << "found non-constant column"; + tileLocations.insert(std::make_pair(column.value(), rowInt)); } + return success(); + }; - auto colComparator = [](AMDAIE::TileOp &a, AMDAIE::TileOp &b) -> bool { - int64_t colA = getConstantIntValue(a.getCol()).value(); - int64_t colB = getConstantIntValue(b.getCol()).value(); - return colA < colB; - }; - if (!targetTiles.empty()) { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign tiles - // based on a resource model. - std::sort(targetTiles.begin(), targetTiles.end(), colComparator); - Value col = targetTiles[0].getCol(); - tileResults.push_back( - rewriter.create(rewriter.getUnknownLoc(), col, row) - .getResult()); - } else if (!sourceTiles.empty()) { - // TODO(jornt): For now, for deterministic behaviour, sort on column - // index and use first one. This needs to be generalized to assign tiles - // based on a resource model. - std::sort(sourceTiles.begin(), sourceTiles.end(), colComparator); - Value col = sourceTiles[0].getCol(); - tileResults.push_back( - rewriter.create(rewriter.getUnknownLoc(), col, row) - .getResult()); - } else { - // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait - // for other logical objectfifos to be assigned first. 
+ if (!targetTiles.empty() && !sourceTiles.empty()) { + return logicalObjectFifo.emitOpError() + << "found logical objectfifo with both source and target tiles, " + "which is not supported yet"; + } else if (!targetTiles.empty()) { + // Create tile locations for this logical objectfifo based on target + // tiles. + if (failed(createTileLocations(targetTiles))) { + return failure(); + } + } else if (!sourceTiles.empty()) { + // Create tile locations for this logical objectfifo based on source + // tiles. + if (failed(createTileLocations(sourceTiles))) { return failure(); } } else { - return logicalObjectFifo.emitOpError() - << "found logical objectfifo with unknown memory space"; + // Don't assign this logicalObjectFifo to a physical tile (yet!). Wait + // for other logical objectfifos to be assigned first. + return failure(); } + // If no tile results, skip, and maybe in a next iteration another tile will // be found. - if (tileResults.empty()) { + if (tileLocations.empty()) { return failure(); } - // Extend this logical objectfifo's tile set. - SmallVector objFifoTiles = logicalObjectFifo.getTiles(); - DenseSet tileSet(objFifoTiles.begin(), objFifoTiles.end()); + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, logicalObjectFifo.getMemref(), + tileLocations.takeVector()); + return success(); + } +}; - // If the logical objectfifo already contains all the new tiles, skip. - if (llvm::all_of(tileResults, - [&](Value val) { return tileSet.contains(val); })) { - return failure(); +/// Return the user DMA operations and corresponding assigned tiles in the +/// specified direction (source or target). +template +SmallVector>> +getUserDmasAndTiles(AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + SmallVector>> + dmaOps; + for (Operation *user : logicalObjectFifo->getUsers()) { + if (auto dmaOp = dyn_cast(user)) { + ValueRange tileIndices; + if constexpr (OperateOn == CopyOpOperateOn::Source) { + if (dmaOp.getTargetObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getSourceObjectFifo().getTiles(); + } else if constexpr (OperateOn == CopyOpOperateOn::Target) { + if (dmaOp.getSourceObjectFifo() != logicalObjectFifo) continue; + tileIndices = dmaOp.getTargetObjectFifo().getTiles(); + } + SmallVector tiles; + for (Value index : tileIndices) + tiles.push_back(dyn_cast(index.getDefiningOp())); + dmaOps.push_back(std::make_pair(dmaOp, tiles)); + } + } + return dmaOps; +} + +/// Assign specific tile locations to objectFifos, starting from the set of +/// potential tile locations filled in earlier. +LogicalResult assignAieTilesAndDistributeLogicalObjectFifos(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (memSpace && dyn_cast(memSpace).getInt() != 1) + return WalkResult::advance(); + + SmallVector tiles = llvm::map_to_vector( + logicalObjectFifo.getTiles(), + [](Value tile) { return dyn_cast(tile.getDefiningOp()); }); + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileColumnComparator); + SmallVector>> + sourceDmaOps = + getUserDmasAndTiles(logicalObjectFifo); + SmallVector>> + targetDmaOps = + getUserDmasAndTiles(logicalObjectFifo); + + // Assign tiles for following cases: + // 1) No source DMA operations (e.g. L3 -> L2): distribute onto multiple + // tiles to potentially use multiple shim DMAs for reading from global + // memory in different columns. 
+ // 2) No target DMA operations (e.g. L2 -> L3): + // distribute onto multiple tiles to potentially use multiple shim DMAs for + // writing to global memory in different columns. + // 3) Default: assign first tile from the sorted sequence of potential + // tiles. + if (sourceDmaOps.empty() && targetDmaOps.size() == tiles.size()) { + llvm::sort(targetDmaOps.begin(), targetDmaOps.end(), dmaColComparator); + for (auto &&[tile, dmaOpElem] : llvm::zip(tiles, targetDmaOps)) { + rewriter.setInsertionPoint(logicalObjectFifo); + SmallVector tileResults = {cast(tile.getResult())}; + auto newLogicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + dmaOpElem.first->replaceUsesOfWith(logicalObjectFifo.getResult(), + newLogicalObjectFifo.getResult()); + } + } else if (targetDmaOps.empty() && sourceDmaOps.size() == tiles.size()) { + llvm::sort(sourceDmaOps.begin(), sourceDmaOps.end(), dmaColComparator); + for (auto &&[tile, dmaOpElem] : llvm::zip(tiles, sourceDmaOps)) { + rewriter.setInsertionPoint(logicalObjectFifo); + SmallVector tileResults = {cast(tile.getResult())}; + auto newLogicalObjectFifo = + rewriter.create( + rewriter.getUnknownLoc(), + cast( + logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); + dmaOpElem.first->replaceUsesOfWith(logicalObjectFifo.getResult(), + newLogicalObjectFifo.getResult()); + } + } else { + // For now, use first tile in sorted list. This will need to become more + // complex in the future to account for potential hardware limitations and + // constraints. + SmallVector tileResults = {cast(tiles[0].getResult())}; + rewriter.setInsertionPoint(logicalObjectFifo); + rewriter.replaceOpWithNewOp( + logicalObjectFifo, + cast(logicalObjectFifo.getOutput().getType()), + logicalObjectFifo.getMemref(), tileResults); } + return WalkResult::advance(); + }); + return success(); +} + +/// Allocate different memories for logical objectFifos on the same shared +/// memory tile to ensure different buffers will be used for them. 
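`assignAieTilesAndDistributeLogicalObjectFifos` above pairs the column-sorted candidate tiles one-to-one with the column-sorted DMA users, so each DMA ends up with its own single-tile objectfifo, and otherwise falls back to the first tile in the sorted list. A standalone sketch of that sort-then-zip assignment over plain integers and strings (names are illustrative, not the pass's API):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

int main() {
  // Candidate tile columns and the column already assigned to each DMA user.
  std::vector<int64_t> tileColumns = {2, 0, 1};
  std::vector<std::pair<std::string, int64_t>> dmaUsers = {
      {"dma_b", 1}, {"dma_c", 2}, {"dma_a", 0}};

  // Sort both sides by column so the pairing is deterministic.
  std::sort(tileColumns.begin(), tileColumns.end());
  std::sort(dmaUsers.begin(), dmaUsers.end(),
            [](const auto &a, const auto &b) { return a.second < b.second; });

  if (tileColumns.size() == dmaUsers.size()) {
    // One tile per DMA user, paired in column order.
    for (size_t i = 0; i < dmaUsers.size(); ++i)
      std::cout << dmaUsers[i].first << " -> column " << tileColumns[i] << "\n";
  } else {
    // Fallback: every DMA uses the first tile in the sorted list.
    std::cout << "all DMAs -> column " << tileColumns.front() << "\n";
  }
  return 0;
}
```

The one-to-one pairing only applies when the number of candidate tiles matches the number of DMA users, mirroring the `targetDmaOps.size() == tiles.size()` / `sourceDmaOps.size() == tiles.size()` guards in the pass.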
+LogicalResult distributeSharedMemory(ModuleOp moduleOp) { + IRRewriter rewriter(moduleOp.getContext()); + + // Map from local objectfifos found to the tiles where they are used + DenseMap>, Value, LocationMapInfo> + locationsToMemref; + + moduleOp->walk([&](AMDAIE::LogicalObjectFifoFromMemrefOp logicalObjectFifo) { + Attribute memSpace = logicalObjectFifo.getMemorySpace(); + if (!memSpace || dyn_cast(memSpace).getInt() != 1) + return WalkResult::advance(); - // Concatenate existing with new tiles and replace the logicalObjectFifo - std::move(objFifoTiles.begin(), objFifoTiles.end(), - std::back_inserter(tileResults)); + SmallVector tiles = + llvm::map_to_vector(logicalObjectFifo.getTiles(), [](Value tile) { + return dyn_cast(tile.getDefiningOp()); + }); + llvm::sort(tiles.begin(), tiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + + SmallVector targetTiles; + (void)getUserTiles(logicalObjectFifo, targetTiles); + llvm::sort(targetTiles.begin(), targetTiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + tiles.insert(tiles.end(), std::make_move_iterator(targetTiles.begin()), + std::make_move_iterator(targetTiles.end())); + + SmallVector sourceTiles; + (void)getUserTiles(logicalObjectFifo, sourceTiles); + llvm::sort(sourceTiles.begin(), sourceTiles.end(), + AMDAIE::TileOp::tileValueColumnAndRowComparator); + tiles.insert(tiles.end(), std::make_move_iterator(sourceTiles.begin()), + std::make_move_iterator(sourceTiles.end())); + LLVM_DEBUG(llvm::dbgs() << "Op: " << logicalObjectFifo + << ", number of tiles: " << tiles.size() << "\n"); + + SmallVector> locations = + llvm::map_to_vector(tiles, [](AMDAIE::TileOp tile) { + return std::make_pair( + (int64_t)getConstantIntValue(tile.getCol()).value(), + (int64_t)getConstantIntValue(tile.getRow()).value()); + }); + if (!locationsToMemref.contains(locations)) { + auto allocOp = dyn_cast( + logicalObjectFifo.getMemref().getDefiningOp()); + rewriter.setInsertionPoint(allocOp); + auto newAllocOp = + dyn_cast(rewriter.clone(*allocOp.getOperation())); + auto newDeallocOp = rewriter.create( + rewriter.getUnknownLoc(), newAllocOp); + newDeallocOp->moveBefore(&newAllocOp->getBlock()->back()); + locationsToMemref[locations] = newAllocOp.getResult(); + } + rewriter.setInsertionPoint(logicalObjectFifo); rewriter.replaceOpWithNewOp( logicalObjectFifo, cast(logicalObjectFifo.getOutput().getType()), - logicalObjectFifo.getMemref(), tileResults); - return success(); - } -}; + locationsToMemref[locations], logicalObjectFifo.getTiles()); + return WalkResult::advance(); + }); + return success(); +} class AMDAIEDistributeCoresAndObjectFifosPass : public impl::AMDAIEDistributeCoresAndObjectFifosBase< @@ -488,33 +885,92 @@ class AMDAIEDistributeCoresAndObjectFifosPass void AMDAIEDistributeCoresAndObjectFifosPass::runOnOperation() { MLIRContext *context = &getContext(); ModuleOp moduleOp = getOperation(); + // Convert local scf.forall operations selected for parallel distribution to // nested scf.for operations. if (failed(localForallToFor(moduleOp))) { + moduleOp.emitOpError() + << "local `scf.forall` to `scf.for` conversion failed"; return signalPassFailure(); } + // Hoist the affine apply ops on scf.for induction variables to the // corresponding scf.for's body. 
   if (failed(hoistAffineApplyDependingOnFor(moduleOp))) {
+    moduleOp.emitOpError() << "`affine.apply` hoisting failed";
     return signalPassFailure();
   }
+  LLVM_DEBUG(llvm::dbgs() << "Module after localForallToFor: \n"
+                          << moduleOp << "\n");
+
+  if (failed(distributeLocalMemory(moduleOp))) {
+    moduleOp.emitOpError() << "local memory distribution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after distributeLocalMemory: \n"
+                          << moduleOp << "\n");
+
   // Unroll local parallel loops and try hoisting dma operations if
   // possible.
   RewritePatternSet unrollLocalLoopsPatterns(context);
   unrollLocalLoopsPatterns.insert<AMDAIEUnrollLocalLoops>(context);
   if (failed(applyPatternsAndFoldGreedily(
           moduleOp, std::move(unrollLocalLoopsPatterns)))) {
+    moduleOp.emitOpError()
+        << "loop unrolling of loops selected for parallel execution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after AMDAIEUnrollLocalLoops: \n"
+                          << moduleOp << "\n");
+
+  // Insert `amdaie.logicalobjectfifo.access` operations which retrieve the
+  // memrefs from logical objectfifos and update the computational operations to
+  // operate on these local memrefs. These access operations will be used to
+  // assign local AIE tiles to local logical objectFifos later.
+  if (failed(insertLogicalObjectFifoAccess(moduleOp))) {
+    moduleOp.emitOpError()
+        << "insertion of `amdaie.logicalobjectfifo.access` operations failed";
     return signalPassFailure();
   }
+  LLVM_DEBUG(llvm::dbgs() << "Module after insertLogicalObjectFifoAccess: \n"
+                          << moduleOp << "\n");
+
   // Assign tile locations to logical objectfifos on local (L1) memory.
   if (failed(assignLocalAieTiles(moduleOp))) {
+    moduleOp.emitOpError() << "local tile assignment failed";
     return signalPassFailure();
   }
-  // Assign tile locations to the remaining logical objectfifos.
+  LLVM_DEBUG(llvm::dbgs() << "Module after assignLocalAieTiles: \n"
+                          << moduleOp << "\n");
+
+  // Assign a set of potential tile locations to the remaining logical
+  // objectFifos.
   RewritePatternSet assignAieTilePatters(context);
-  assignAieTilePatters.insert<AssignAieTiles>(context);
+  assignAieTilePatters.insert<FillAieTiles>(context);
   if (failed(applyPatternsAndFoldGreedily(moduleOp,
                                           std::move(assignAieTilePatters)))) {
+    moduleOp.emitOpError()
+        << "collection of tile candidates for logical objectFifos failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs() << "Module after FillAieTiles: \n"
+                          << moduleOp << "\n");
+
+  // Assign specific tile locations to objectFifos, starting from the set of
+  // potential tile locations filled in earlier.
+  if (failed(assignAieTilesAndDistributeLogicalObjectFifos(moduleOp))) {
+    moduleOp.emitOpError()
+        << "tile assignment and logical objectFifo distribution failed";
+    return signalPassFailure();
+  }
+  LLVM_DEBUG(llvm::dbgs()
+             << "Module after assignAieTilesAndDistributeLogicalObjectFifos: \n"
+             << moduleOp << "\n");
+
+  // Allocate different memories for logical objectFifos on the same shared
+  // memory tile to ensure different buffers will be used for them.
+ if (failed(distributeSharedMemory(moduleOp))) { + moduleOp.emitOpError() << "distribution of shared memory failed"; return signalPassFailure(); } } diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp index bb96c9f4d..64f0f39d8 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerToAIE.cpp @@ -264,6 +264,12 @@ LogicalResult coreToAIE(IRRewriter &rewriter, AMDAIE::CoreOp coreOp, auto walkResult = aieCoreOp.walk([&](Operation *op) { rewriter.setInsertionPoint(op); if (TypeSwitch(op) + .Case([&](auto accessOp) { + // TODO(jornt): Temporary until access operations are used for + // inserting synchronization stubs instead of consume/produce. + rewriter.eraseOp(accessOp); + return success(); + }) .Case([&](auto acquireOp) { return acquireOpToAIE(rewriter, acquireOp, mapper, localMemrefMapper); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir index 95e046396..3f2c9d123 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/distribute-cores-and-objectfifos.mlir @@ -1,4 +1,4 @@ -// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file %s | FileCheck %s +// RUN: iree-opt --pass-pipeline="builtin.module(iree-amdaie-distribute-cores-and-objectfifos,cse)" --split-input-file --verify-diagnostics %s | FileCheck %s // Check for unrolling an amdaie.core within a parallel loop with a single // induction variable with multiple iterations. 
There are no dma ops in this @@ -10,30 +10,24 @@ // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]]) -// CHECK: %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]]) -// CHECK: %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]]) -// CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) -// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]]) +// CHECK: %[[TILE_0:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_0]]) +// CHECK: %[[TILE_1:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_1]]) +// CHECK: %[[TILE_2:.*]] = amdaie.tile(%[[C2]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_2]]) +// CHECK: %[[TILE_3:.*]] = amdaie.tile(%[[C3]], %[[C2]]) +// CHECK: %{{.*}} = amdaie.core(%[[TILE_3]]) module { func.func @distribute_cores_and_objectfifos_1x4() { %c2 = arith.constant 2 : index scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - scf.forall (%arg2, %arg3) in (1, 4) { - %tile = amdaie.tile(%arg3, %c2) - %21 = amdaie.core(%tile) { - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + scf.forall (%arg2, %arg3) in (1, 4) { + %tile = amdaie.tile(%arg3, %c2) + %21 = amdaie.core(%tile) { amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} return } @@ -48,29 +42,23 @@ module { // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK: scf.forall -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]]) -// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]]) +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[CORE_0_0:.*]] = amdaie.core(%[[TILE_0_0]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[CORE_0_1:.*]] = amdaie.core(%[[TILE_0_1]]) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[CORE_1_0:.*]] = amdaie.core(%[[TILE_1_0]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[CORE_1_1:.*]] = amdaie.core(%[[TILE_1_1]]) module { func.func @distribute_cores_and_objectfifos_2x2() { scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - scf.forall (%arg2, %arg3) in (2, 2) { - %tile = amdaie.tile(%arg3, %arg2) - %0 = amdaie.core(%tile) { - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + scf.forall (%arg2, %arg3) in (2, 2) { + %tile = amdaie.tile(%arg3, %arg2) + %0 = amdaie.core(%tile) { amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} return } @@ -86,46 +74,47 @@ module { // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> -// CHECK-DAG: 
%[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) module { func.func @unroll_dma() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %tile = amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end 
- } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3, %arg3] [%arg3, %arg3] [%arg3, %arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile = amdaie.tile(%arg3, %c2) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -146,41 +135,39 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK: linalg.fill ins(%{{.+}} : i32) outs module { func.func @hoist_dma_single_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %tile = 
amdaie.tile(%arg3, %c2) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %tile = amdaie.tile(%arg3, %c2) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -198,7 +185,7 @@ module { // be hoisted. To check this, we use `CHECK-NOT: amdaie.dma_cpy_nd` after // already encountered once. // -// CHECK-LABEL: @hoist_dma_and_affine_single_loop +// CHECK-LABEL: @hoist_dma_and_affine_single_loop_2x1 // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index @@ -206,42 +193,94 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-NOT: amdaie.dma_cpy_nd -// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-NOT: amdaie.dma_cpy_nd +// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) #map = affine_map<(d0) -> (d0 * 32)> module { - func.func @hoist_dma_and_affine_single_loop() { + func.func @hoist_dma_and_affine_single_loop_2x1() { + %c0_i32 = arith.constant 0 : i32 %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall 
(%arg2, %arg3) in (2, 1) { - %c2 = arith.constant 2 : index - %apply = affine.apply #map(%arg3) - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 1) { + %c2 = arith.constant 2 : index + %apply = affine.apply #map(%arg3) + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_1 : memref<32x64xi32, 2> + memref.dealloc %alloc : memref<32x1024xi32, 1> + return + } +} + +// ----- + +// Check for unrolling a parallel loop, with both cores and dma ops. The dma op +// does depend on one of the induction variables and can't be hoisted. However, +// in this test, the DMA operation does depend on an affine apply operation +// within the `scf.for` operation's scope and checks whether both the affine +// apply and the DMA can be unrolled correctly. +// +// CHECK-LABEL: @unroll_dma_and_affine_single_loop +// CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_3]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-DAG: amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-DAG: amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +#map = affine_map<(d0) -> (d0 * 32)> +module { + func.func @unroll_dma_and_affine_single_loop() { + %c0_i32 = arith.constant 0 : i32 + %alloc = memref.alloc() : memref<32x1024xi32, 1> + %alloc_1 = memref.alloc() : memref<32x64xi32, 2> + scf.forall (%arg0, %arg1) in (1, 1) { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> 
!amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 1) { + %c2 = arith.constant 2 : index + %apply = affine.apply #map(%arg2) + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%apply] [%c2] [%c2]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -263,50 +302,46 @@ module { // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_3]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]], %[[TILE_0_3]], %[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = 
amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_1]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) module { func.func @hoist_dma_multi_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -326,57 +361,50 @@ module { // CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C3:.*]] = arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<32x1024xi32, 1> -// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<32x1024xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.*]], %[[ARG1:.*]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_1_3]] -// CHECK-SAME: %[[TILE_1_2]] -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]] -// CHECK-SAME: %[[TILE_0_3]] -// CHECK-SAME: %[[TILE_0_2]] -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// 
CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.*]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.*]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_0]]) +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) module { func.func @hoist_dma_one_of_multi_loop() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32, 1> %alloc_1 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %3 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%2) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32, 1> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %2 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %3 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%2) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_1 : memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_1 : memref<32x64xi32, 2> memref.dealloc %alloc : memref<32x1024xi32, 1> @@ -398,64 +426,69 @@ module { // CHECK-DAG: %[[C3:.+]] = 
arith.constant 3 : index // CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<32x1024xi32> // CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<32x64xi32, 1> -// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 1> +// CHECK-DAG: %[[ALLOC_3:.+]] = memref.alloc() : memref<32x64xi32, 2> // CHECK: scf.forall (%[[ARG0:.+]], %[[ARG1:.+]]) in (1, 1) { -// CHECK: amdaie.workgroup { -// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) -// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) -// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) -// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) -// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) -// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) -// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) -// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) -// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_1_3]], %[[TILE_1_2]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_0_2]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_5]] -// CHECK-SAME: %[[FROM_MEMREF_3]] -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] -// CHECK-SAME: %[[FROM_MEMREF_2]] -// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[TILE_1_0:.+]] = amdaie.tile(%[[C1]], %[[C0]]) +// CHECK-DAG: %[[TILE_1_1:.+]] = amdaie.tile(%[[C1]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref 
%[[ALLOC_3]], {%[[TILE_1_2]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_2]], %[[TILE_0_3]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_5]] +// CHECK-SAME: %[[FROM_MEMREF_3]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_0]] +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] +// CHECK-SAME: %[[FROM_MEMREF_2]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs module { func.func @hoist_dma_dependencies() { + %c0_i32 = arith.constant 0 : i32 %c2 = arith.constant 2 : index %alloc = memref.alloc() : memref<32x1024xi32> %alloc_1 = memref.alloc() : memref<32x64xi32, 1> %alloc_2 = memref.alloc() : memref<32x64xi32, 2> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> - %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> - %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (2, 2) { - %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %add = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %add) - %core = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%4) - amdaie.end - } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (2, 2) { + %3 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg3] [%arg3] [%arg3]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %4 = amdaie.dma_cpy_nd(%2[] [] [], %1[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %core = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%4) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : 
memref<32x64xi32, 2>) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_2 : memref<32x64xi32, 2> memref.dealloc %alloc_1 : memref<32x64xi32, 1> @@ -466,68 +499,182 @@ module { // ----- -// CHECK-LABEL: @distribute_cores_and_objectfifos +// Check dependencies of DMAs on preceding DMAs at different loop levels. +// +// CHECK-LABEL: @nested_dma_dependencies +// CHECK-DAG: %[[C0:.+]] = arith.constant 0 : index +// CHECK-DAG: %[[C1:.+]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.+]] = arith.constant 2 : index +// CHECK-DAG: %[[C3:.+]] = arith.constant 3 : index +// CHECK-DAG: %[[ALLOC_0:.+]] = memref.alloc() : memref<32x1024xi32> +// CHECK-DAG: %[[ALLOC_1:.+]] = memref.alloc() : memref<32x128xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.+]] = memref.alloc() : memref<32x64xi32, 2> +// CHECK-DAG: %[[ALLOC_3:.+]] = memref.alloc() : memref<32x32xi32, 2> +// CHECK-DAG: %[[ALLOC_4:.+]] = memref.alloc() : memref<2x2x32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_5:.+]] = memref.alloc() : memref<64x64xi32> +// CHECK: scf.forall (%{{.+}}, %[[ARG1:.+]]) in (2, 2) +// CHECK-DAG: %[[TILE_0_2:.+]] = amdaie.tile(%[[C0]], %[[C2]]) +// CHECK-DAG: %[[TILE_0_3:.+]] = amdaie.tile(%[[C0]], %[[C3]]) +// CHECK-DAG: %[[TILE_1_2:.+]] = amdaie.tile(%[[C1]], %[[C2]]) +// CHECK-DAG: %[[TILE_1_3:.+]] = amdaie.tile(%[[C1]], %[[C3]]) +// CHECK-DAG: %[[TILE_0_0:.+]] = amdaie.tile(%[[C0]], %[[C0]]) +// CHECK-DAG: %[[TILE_0_1:.+]] = amdaie.tile(%[[C0]], %[[C1]]) +// CHECK-DAG: %[[FROM_MEMREF_0:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_3]], %[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_6:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_0_3]]} +// CHECK-DAG: %[[FROM_MEMREF_7:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_3]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.+]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]][] [] [], %[[FROM_MEMREF_0]][%[[ARG1]]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]][] [] [], %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_4]] +// CHECK-DAG: %[[CORE_0_2:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_4]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c0, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_5]] +// CHECK-DAG: %[[CORE_1_2:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = 
amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_3]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]][] [] [], %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c0] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_6]] +// CHECK-DAG: %[[CORE_0_3:.*]] = amdaie.core(%[[TILE_0_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]][%c1, %c1] [%c1, %c1] [%c1, %c1], %[[FROM_MEMREF_7]] +// CHECK-DAG: %[[CORE_1_3:.*]] = amdaie.core(%[[TILE_1_3]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Write) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_2]], Read) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_4]]) +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: linalg.fill ins(%{{.+}} : i32) outs +// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]][%[[ARG1]]] [%c1] [%c1], %[[FROM_MEMREF_8]] +module { + func.func @nested_dma_dependencies() { + %c0_i32 = arith.constant 0 : i32 + %c1 = arith.constant 1 : index + %c2 = arith.constant 2 : index + %alloc = memref.alloc() : memref<32x1024xi32> + %alloc_1 = memref.alloc() : memref<32x128xi32, 1> + %alloc_2 = memref.alloc() : memref<32x64xi32, 2> + %alloc_3 = memref.alloc() : memref<32x32xi32, 2> + %alloc_4 = memref.alloc() : memref<2x2x32x32xi32, 1> + %alloc_5 = memref.alloc() : memref<64x64xi32> + scf.forall (%arg0, %arg1) in (2, 2) { + %0 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %1 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<32x128xi32, 1> -> !amdaie.logicalobjectfifo> + %2 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x64xi32, 2> -> !amdaie.logicalobjectfifo> + %3 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<32x32xi32, 2> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<2x2x32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_5, {} : memref<64x64xi32> -> !amdaie.logicalobjectfifo> + %6 = amdaie.dma_cpy_nd(%1[] [] [], %0[%arg1] [%c1] [%c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + scf.forall (%arg2, %arg3) in (2, 2) { + %7 = amdaie.dma_cpy_nd(%2[] [] [], %1[%arg2] [%c1] [%c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %8 = amdaie.dma_cpy_nd(%4[%arg2, %arg3] [%c1, %c1] [%c1, %c1], %3[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %add = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %add) + %core = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%7) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_2 : memref<32x64xi32, 2>) + linalg.fill ins(%c0_i32 : i32) outs(%alloc_3 : memref<32x32xi32, 2>) + amdaie.logicalobjectfifo.produce(%8) + amdaie.end + } + } {mapping = [#gpu.thread, #gpu.thread]} + %9 = amdaie.dma_cpy_nd(%5[%arg1] 
[%c1] [%c1], %4[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + } {mapping = [#gpu.block, #gpu.block]} + memref.dealloc %alloc_5 : memref<64x64xi32> + memref.dealloc %alloc_4 : memref<2x2x32x32xi32, 1> + memref.dealloc %alloc_3 : memref<32x32xi32, 2> + memref.dealloc %alloc_2 : memref<32x64xi32, 2> + memref.dealloc %alloc_1 : memref<32x128xi32, 1> + memref.dealloc %alloc : memref<32x1024xi32> + return + } +} + +// ----- + +// CHECK-LABEL: @distribute_cores_and_objectfifos // CHECK-DAG: %[[IN_B:.*]] = hal.interface.binding.subspan set(0) binding(1) // CHECK-DAG: %[[IN_A:.*]] = hal.interface.binding.subspan set(0) binding(0) // CHECK-DAG: %[[OUTPUT:.*]] = hal.interface.binding.subspan set(0) binding(2) // CHECK-DAG: %[[ALLOC:.*]] = memref.alloc() : memref<4x8x8x8xi32, 2> // CHECK-DAG: %[[ALLOC_0:.*]] = memref.alloc() : memref<8x8x4x8xi32, 2> // CHECK-DAG: %[[ALLOC_1:.*]] = memref.alloc() : memref<4x8x4x8xi32, 2> -// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_2:.*]] = memref.alloc() : memref<32x64xi32, 1> // CHECK-DAG: %[[ALLOC_3:.*]] = memref.alloc() : memref<64x32xi32, 1> -// CHECK-DAG: %[[ALLOC_4:.*]] = memref.alloc() : memref<32x64xi32, 1> -// CHECK-DAG: scf.forall -// CHECK-SAME: in (1, 1) -// CHECK-DAG: amdaie.workgroup { -// CHECK-DAG: %[[TILE:.*]] = amdaie.tile(%c1, %c2) -// CHECK-DAG: %[[TILE_5:.*]] = amdaie.tile(%c0, %c2) -// CHECK-DAG: %[[TILE_6:.*]] = amdaie.tile(%c0, %c1) -// CHECK-DAG: %[[TILE_7:.*]] = amdaie.tile(%c1, %c1) -// CHECK-DAG: %[[TILE_8:.*]] = amdaie.tile(%c1, %c0) -// CHECK-DAG: %[[TILE_9:.*]] = amdaie.tile(%c0, %c0) -// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_7]]} -// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_7]]} -// CHECK-DAG: %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_6]]} -// CHECK-DAG: %[[FROM_MEMREF_5:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE]]} -// CHECK-DAG: %[[FROM_MEMREF_6:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_7:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE]], %[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE]]} -// CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_5]]} -// CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_8]]} -// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_9]]} -// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_9]]} -// CHECK-DAG: %[[FROM_MEMREF_13:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_8]]} -// CHECK-DAG: %[[FROM_MEMREF_14:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_9]]} -// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] -// CHECK-SAME: %[[FROM_MEMREF_12]] -// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_7]] -// CHECK-SAME: %[[FROM_MEMREF_0]] -// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] -// CHECK-SAME: %[[FROM_MEMREF_14]] -// CHECK-DAG: %[[DMA_3:.*]] = 
amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]] -// CHECK-SAME: %[[FROM_MEMREF_2]] -// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] -// CHECK-SAME: %[[FROM_MEMREF_6]] -// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_11]] -// CHECK-SAME: %[[FROM_MEMREF_4]] -// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_5]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_4]]) -// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] -// CHECK-SAME: %[[FROM_MEMREF_13]] -// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]] -// CHECK-SAME: %[[FROM_MEMREF_1]] -// CHECK-DAG: %[[DMA_8:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] -// CHECK-SAME: %[[FROM_MEMREF_5]] -// CHECK-DAG: %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] -// CHECK-SAME: %[[FROM_MEMREF_3]] -// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) -// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_7]]) -// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_8]]) +// CHECK-DAG: %[[ALLOC_4:.*]] = memref.alloc() : memref<64x32xi32, 1> +// CHECK-DAG: %[[ALLOC_5:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: %[[ALLOC_6:.*]] = memref.alloc() : memref<32x32xi32, 1> +// CHECK-DAG: scf.forall (%{{.+}}, %{{.+}}) in (1, 1) +// CHECK-DAG: %[[TILE_1_2:.*]] = amdaie.tile(%c1, %c2) +// CHECK-DAG: %[[TILE_0_2:.*]] = amdaie.tile(%c0, %c2) +// CHECK-DAG: %[[TILE_0_1:.*]] = amdaie.tile(%c0, %c1) +// CHECK-DAG: %[[TILE_1_1:.*]] = amdaie.tile(%c1, %c1) +// CHECK-DAG: %[[TILE_1_0:.*]] = amdaie.tile(%c1, %c0) +// CHECK-DAG: %[[TILE_0_0:.*]] = amdaie.tile(%c0, %c0) +// CHECK-DAG: %[[FROM_MEMREF_0:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_2]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_1:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_3]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_2:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_4]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_3:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_5]], {%[[TILE_1_1]]} +// CHECK-DAG: %[[FROM_MEMREF_4:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_6]], {%[[TILE_0_1]]} +// CHECK-DAG: %[[FROM_MEMREF_5:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_6:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_1]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_7:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC_0]], {%[[TILE_0_2]], %[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_8:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_1_2]]} +// CHECK-DAG: %[[FROM_MEMREF_9:.*]] = amdaie.logicalobjectfifo.from_memref %[[ALLOC]], {%[[TILE_0_2]]} +// CHECK-DAG: %[[FROM_MEMREF_10:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_11:.*]] = amdaie.logicalobjectfifo.from_memref %[[OUTPUT]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_12:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_A]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[FROM_MEMREF_13:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_1_0]]} +// CHECK-DAG: %[[FROM_MEMREF_14:.*]] = amdaie.logicalobjectfifo.from_memref %[[IN_B]], {%[[TILE_0_0]]} +// CHECK-DAG: %[[DMA_0:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_0]] +// CHECK-SAME: %[[FROM_MEMREF_12]] +// CHECK-DAG: %[[DMA_1:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_7]] +// CHECK-SAME: 
%[[FROM_MEMREF_0]] +// CHECK-DAG: %[[DMA_2:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_2]] +// CHECK-SAME: %[[FROM_MEMREF_14]] +// CHECK-DAG: %[[DMA_3:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_9]] +// CHECK-SAME: %[[FROM_MEMREF_2]] +// CHECK-DAG: %[[DMA_4:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_4]] +// CHECK-SAME: %[[FROM_MEMREF_6]] +// CHECK-DAG: %[[DMA_5:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_11]] +// CHECK-SAME: %[[FROM_MEMREF_4]] +// CHECK-DAG: %[[CORE_0:.*]] = amdaie.core(%[[TILE_0_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_9]], Read) +// CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_6]], Write) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_3]]) +// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_4]]) +// CHECK-DAG: %[[DMA_6:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_1]] +// CHECK-SAME: %[[FROM_MEMREF_13]] +// CHECK-DAG: %[[DMA_7:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_8]] +// CHECK-SAME: %[[FROM_MEMREF_1]] +// CHECK-DAG: %[[DMA_8:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_3]] +// CHECK-SAME: %[[FROM_MEMREF_5]] +// CHECK-DAG: %[[DMA_9:.*]] = amdaie.dma_cpy_nd(%[[FROM_MEMREF_10]] +// CHECK-SAME: %[[FROM_MEMREF_3]] +// CHECK-DAG: %[[CORE_1:.*]] = amdaie.core(%[[TILE_1_2]]) +// CHECK-DAG: %[[VAL_0:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_7]], Read) +// CHECK-DAG: %[[VAL_1:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_8]], Read) +// CHECK-DAG: %[[VAL_2:.+]] = amdaie.logicalobjectfifo.access(%[[FROM_MEMREF_5]], Write) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_1]]) +// CHECK-DAG: amdaie.logicalobjectfifo.consume(%[[DMA_7]]) +// CHECK-DAG: amdaie.logicalobjectfifo.produce(%[[DMA_8]]) #map = affine_map<(d0) -> (d0 * 32)> #map1 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d2, d0, d3, d5)> #map2 = affine_map<(d0, d1, d2, d3, d4, d5) -> (d1, d2, d5, d4)> @@ -558,44 +705,39 @@ module { %alloc_3 = memref.alloc() : memref<64x32xi32, 1> %alloc_4 = memref.alloc() : memref<32x64xi32, 1> scf.forall (%arg0, %arg1) in (1, 1) { - amdaie.workgroup { - %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> - %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<64x32xi32, 1> -> !amdaie.logicalobjectfifo> - %5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> - %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %7 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> - %8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<4x8x8x8xi32, 2> -> !amdaie.logicalobjectfifo> - %9 = amdaie.logicalobjectfifo.from_memref %2, {} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> - %10 = amdaie.logicalobjectfifo.from_memref %1, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> - %11 = amdaie.logicalobjectfifo.from_memref %0, {} : memref<1024x64xi32> -> !amdaie.logicalobjectfifo> - scf.forall (%arg2, %arg3) in (1, 2) { - %12 = affine.apply #map(%arg2) - %13 = affine.apply #map(%arg3) - %14 = amdaie.dma_cpy_nd(%3[] [] [], %10[%12, %c960] [%c32, %c64] [%c1024, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %15 = amdaie.dma_cpy_nd(%4[] [] [], %11[%c960, %13] [%c64, %c32] [%c64, %c1]) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) - %16 = amdaie.dma_cpy_nd(%7[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %3[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %17 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %4[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c8, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %18 = amdaie.dma_cpy_nd(%5[%c0, %c0] [%c32, %c32] [%c32, %c1], %6[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) - %20 = arith.addi %arg2, %c2 : index - %tile = amdaie.tile(%arg3, %20) - %21 = amdaie.core(%tile) { - amdaie.logicalobjectfifo.consume(%16) - amdaie.logicalobjectfifo.consume(%17) - linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) { - ^bb0(%in: i32, %in_5: i32, %out: i32): - %22 = arith.muli %in, %in_5 : i32 - %23 = arith.addi %out, %22 : i32 - linalg.yield %23 : i32 - } - amdaie.logicalobjectfifo.produce(%18) - amdaie.end + %3 = amdaie.logicalobjectfifo.from_memref %alloc_4, {} : memref<32x64xi32, 1> -> !amdaie.logicalobjectfifo> + %4 = amdaie.logicalobjectfifo.from_memref %alloc_3, {} : memref<64x32xi32, 1> -> !amdaie.logicalobjectfifo> + %5 = amdaie.logicalobjectfifo.from_memref %alloc_2, {} : memref<32x32xi32, 1> -> !amdaie.logicalobjectfifo> + %6 = amdaie.logicalobjectfifo.from_memref %alloc_1, {} : memref<4x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %7 = amdaie.logicalobjectfifo.from_memref %alloc_0, {} : memref<8x8x4x8xi32, 2> -> !amdaie.logicalobjectfifo> + %8 = amdaie.logicalobjectfifo.from_memref %alloc, {} : memref<4x8x8x8xi32, 2> -> !amdaie.logicalobjectfifo> + %9 = amdaie.logicalobjectfifo.from_memref %2, {} : memref<32x64xi32> -> !amdaie.logicalobjectfifo> + %10 = amdaie.logicalobjectfifo.from_memref %1, {} : memref<32x1024xi32> -> !amdaie.logicalobjectfifo> + %11 = amdaie.logicalobjectfifo.from_memref %0, {} : memref<1024x64xi32> -> !amdaie.logicalobjectfifo> + scf.forall (%arg2, %arg3) in (1, 2) { + %12 = affine.apply #map(%arg2) + %13 = affine.apply #map(%arg3) + %14 = amdaie.dma_cpy_nd(%3[] [] [], %10[%12, %c960] [%c32, %c64] [%c1024, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %15 = amdaie.dma_cpy_nd(%4[] [] [], %11[%c960, %13] [%c64, %c32] [%c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %16 = amdaie.dma_cpy_nd(%7[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c256, %c32, %c8, %c1], %3[%c0, %c0, %c0, %c0] [%c8, %c8, %c4, %c8] [%c8, %c256, %c64, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %17 = amdaie.dma_cpy_nd(%8[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c512, %c64, %c8, %c1], %4[%c0, %c0, %c0, %c0] [%c4, %c8, %c8, %c8] [%c8, %c256, %c32, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %18 = amdaie.dma_cpy_nd(%5[%c0, %c0] [%c32, %c32] [%c32, %c1], %6[%c0, %c0, %c0, %c0] [%c8, %c4, %c4, %c8] [%c32, %c8, %c256, %c1]) : (!amdaie.logicalobjectfifo>, !amdaie.logicalobjectfifo>) + %19 = amdaie.dma_cpy_nd(%9[%12, %13] [%c32, %c32] [%c64, %c1], %5[] [] []) : (!amdaie.logicalobjectfifo>, 
!amdaie.logicalobjectfifo>) + %20 = arith.addi %arg2, %c2 : index + %tile = amdaie.tile(%arg3, %20) + %21 = amdaie.core(%tile) { + amdaie.logicalobjectfifo.consume(%16) + amdaie.logicalobjectfifo.consume(%17) + linalg.generic {indexing_maps = [#map1, #map2, #map3], iterator_types = ["parallel", "parallel", "reduction", "parallel", "parallel", "reduction"]} ins(%alloc_0, %alloc : memref<8x8x4x8xi32, 2>, memref<4x8x8x8xi32, 2>) outs(%alloc_1 : memref<4x8x4x8xi32, 2>) { + ^bb0(%in: i32, %in_5: i32, %out: i32): + %22 = arith.muli %in, %in_5 : i32 + %23 = arith.addi %out, %22 : i32 + linalg.yield %23 : i32 } - } {mapping = [#gpu.thread, #gpu.thread]} - amdaie.controlcode { + amdaie.logicalobjectfifo.produce(%18) amdaie.end } - } + } {mapping = [#gpu.thread, #gpu.thread]} } {mapping = [#gpu.block, #gpu.block]} memref.dealloc %alloc_4 : memref<32x64xi32, 1> memref.dealloc %alloc_3 : memref<64x32xi32, 1> diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index d522dd0d9..d721cef5f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -326,13 +326,19 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch( xrt::run run = xrt::run(kernel); + // set opcode for transaction binary execution + unsigned int opcode = 3; + // Index to push arguments on the kernel. iree_host_size_t arg_index = 0; - // First argument is the LX6 instructions. + // First argument is the opcode. + run.set_arg(arg_index++, opcode); + + // Second argument is the LX6 instructions. run.set_arg(arg_index++, instr); - // Second argument is the number of LX6 instructions. + // Third argument is the number of LX6 instructions. run.set_arg(arg_index++, num_instr); // Copy descriptors from all sets to the end of the current segment for later diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index f7d75be6a..d572dbe4c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -209,11 +209,11 @@ iree_status_t iree_hal_xrt_native_executable_create( std::make_unique(contexts[xclbin_index], entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. This buffer is always passed as - // the first argument to the kernel and we can use the - // kernel.group_id(/*index of first argument*/=0) to get the group_id. + // the second argument to the kernel and we can use the + // kernel.group_id(/*index of second argument*/=1) to get the group_id. instr = std::make_unique(device, num_instr * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, - kernel.get()->group_id(0)); + kernel.get()->group_id(1)); } catch (...) { iree_hal_executable_destroy((iree_hal_executable_t*)executable); IREE_TRACE_ZONE_END(z0); diff --git a/third_party/mlir-aie b/third_party/mlir-aie index 6f70bfe49..3ac9566f1 160000 --- a/third_party/mlir-aie +++ b/third_party/mlir-aie @@ -1 +1 @@ -Subproject commit 6f70bfe4904ec719042d7ebd8ded9ad8b31bb5b6 +Subproject commit 3ac9566f1da7c4ee6e81c263bc15d92aba7bcae7 diff --git a/third_party/mlir-air b/third_party/mlir-air index 766f50c77..b2df4d74a 160000 --- a/third_party/mlir-air +++ b/third_party/mlir-air @@ -1 +1 @@ -Subproject commit 766f50c7768dc9a12bb933f9ed45014a889d106e +Subproject commit b2df4d74a77e6d7be327e75802098eb96b5c9a35
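For reference, the XRT driver hunks above (direct_command_buffer.cc and native_executable.cc) reorder the kernel arguments: a transaction-binary opcode (3) is now passed first, the LX6 instruction buffer second (hence the switch to kernel.group_id(1) when allocating it), and the instruction count third. The listing below is a minimal host-side sketch of that argument layout, not code from this patch: the xclbin path ("design.xclbin"), the kernel name ("MLIR_AIE"), and the placeholder instruction words are assumptions made purely for illustration.

#include <cstdint>
#include <vector>

#include "xrt/xrt_bo.h"
#include "xrt/xrt_device.h"
#include "xrt/xrt_kernel.h"

int main() {
  // Open the device and load a compiled design; both names are placeholders.
  xrt::device device(0);
  auto uuid = device.load_xclbin("design.xclbin");
  xrt::kernel kernel(device, uuid, "MLIR_AIE");

  // LX6 instruction words; in the real driver these come from the executable.
  std::vector<uint32_t> instructions = {0u, 0u, 0u};

  // The instruction buffer is now the second kernel argument, so its memory
  // bank is looked up with group_id(1) rather than group_id(0).
  xrt::bo instr(device, instructions.size() * sizeof(uint32_t),
                xrt::bo::flags::cacheable, kernel.group_id(1));
  instr.write(instructions.data());
  instr.sync(XCL_BO_SYNC_BO_TO_DEVICE);

  // Argument order used by the updated dispatch path:
  //   0: opcode (3 selects transaction-binary execution)
  //   1: LX6 instruction buffer
  //   2: number of LX6 instructions
  xrt::run run(kernel);
  unsigned int arg_index = 0;
  unsigned int opcode = 3;
  run.set_arg(arg_index++, opcode);
  run.set_arg(arg_index++, instr);
  run.set_arg(arg_index++, static_cast<uint32_t>(instructions.size()));
  // Buffer bindings for the dispatch's descriptor sets would be appended here.
  run.start();
  run.wait();
  return 0;
}

Passing the opcode as the leading argument lets the same kernel entry point multiplex different command types; every existing argument index shifts by one, which is why native_executable.cc now queries group_id(1) for the cacheable instruction buffer.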