diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp index 29db63969..076fda47d 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEConvertToDma.cpp @@ -251,6 +251,67 @@ LogicalResult processInputs(Operation *op, SmallVector &offsets, return success(); } +LogicalResult packL3ToL2(IREE::LinalgExt::PackOp packOp, + SmallVector &offsets, + SmallVector &sizes, + SmallVector &strides) { + MLIRContext *ctx = packOp.getContext(); + + llvm::ArrayRef permutation = packOp.getOuterDimsPerm(); + llvm::ArrayRef innerTiles = packOp.getStaticInnerTiles(); + + SmallVector innerSizes; + SmallVector innerStrides; + SmallVector innerOffsets; + auto innerDimsPos = packOp.getInnerDimsPos(); + + int numOuterDims = sizes.size() - innerTiles.size(); + SmallVector outerOffsets = SmallVector( + offsets.begin(), offsets.begin() + numOuterDims); + SmallVector outerStrides = SmallVector( + strides.begin(), strides.begin() + numOuterDims); + SmallVector outerSizes = + SmallVector(sizes.begin(), sizes.begin() + numOuterDims); + + // Apply inverse permutation to the outer dims if permutation provided (if + // permutation not provided, it is identity, and therefore so is the inverse). + if (!permutation.empty()) { + SmallVector inversePermutation = + invertPermutationVector(permutation); + applyPermutationToVector(outerStrides, inversePermutation); + applyPermutationToVector(outerSizes, inversePermutation); + applyPermutationToVector(outerOffsets, inversePermutation); + } + // Do the unpacking on the Outer dims. + llvm::SmallDenseMap outerDimsIndexMap; + // Initialize the indexing of each outer dim. 
+ for (int i = 0; i < numOuterDims; i++) { + outerDimsIndexMap[i] = i; + } + for (int i = 0; i < innerTiles.size(); i++) { + // Insert inner dims adjacent to their corresponding outer dims. + outerSizes.insert( + outerSizes.begin() + outerDimsIndexMap[innerDimsPos[i]] + 1, + getAsIndexOpFoldResult(ctx, innerTiles[i])); + outerStrides.insert( + outerStrides.begin() + outerDimsIndexMap[innerDimsPos[i]] + 1, + strides[numOuterDims + i]); + outerOffsets.insert( + outerOffsets.begin() + outerDimsIndexMap[innerDimsPos[i]] + 1, + offsets[numOuterDims + i]); + // Update the map as all the dimensions inner to the innerDimsPos[i] are now + // shifted by 1. + for (int j = innerDimsPos[i] + 1; j < numOuterDims; j++) { + outerDimsIndexMap[j]++; + } + } + // Make the outer dims as the final returned dims + offsets = outerOffsets; + strides = outerStrides; + sizes = outerSizes; + return success(); +} + /// Rewrite the pack/unpack op 'op' as a DMA operation. The function arguments /// 'input', 'output', and 'innerTiles' are the input, output, and inner tile /// of 'op'. If 'op' is not a pack/unpack op, or if it determined to not @@ -283,10 +344,6 @@ LogicalResult rewriteAsDma(IRRewriter &rewriter, Operation *op, Value input, return failure(); } - if (!succeeded(processInputs(op, srcOffsets, srcShape, srcBaseStrides))) { - return failure(); - } - // Prepare destination DMA inputs. 
SmallVector dstOffsets; SmallVector dstBaseStrides; @@ -295,6 +352,23 @@ LogicalResult rewriteAsDma(IRRewriter &rewriter, Operation *op, Value input, return failure(); } + uint32_t srcMemspace = + cast(input.getType()).getMemorySpaceAsInt(); + uint32_t dstMemspace = + cast(output.getType()).getMemorySpaceAsInt(); + + if (auto packOp = dyn_cast(op) && srcMemspace == 0 && + dstMemspace == 1) { + if (!succeeded(packL3ToL2(dyn_cast(op), dstOffsets, + dstShape, dstBaseStrides))) { + return failure(); + } + } else { + if (!succeeded(processInputs(op, srcOffsets, srcShape, srcBaseStrides))) { + return failure(); + } + } + // Create logical objectFifos from source and destination memrefs. Value srcVal = sourceOp->getResult(0); Value dstVal = dstOp->getResult(0); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp index 949a1c007..0c14a7b02 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIEDmaLoopSubsumption.cpp @@ -479,6 +479,24 @@ struct SubsumeLoopIntoDMA return false; }; + auto circularUsersInSameScope = + [&](Value result, + SmallVector users) -> bool { + bool currentUser = false; + for (AMDAIE::DoublyStridedOpInterface userOp : llvm::reverse(users)) { + if (isa(userOp) && + userOp != op.getOperation()) { + return true; + } + if (userOp == op.getOperation()) { + currentUser = true; + continue; + } + if (currentUser) return true; + } + return false; + }; + uint8_t sourceMemspaceInt; uint8_t targetMemspaceInt; if (auto npuDmaOp = dyn_cast(op.getOperation())) { @@ -525,7 +543,17 @@ struct SubsumeLoopIntoDMA return rewriter.notifyMatchFailure( op, "should operate on an `amdaie.connection` op"); } - if (hasUsersInSameScope(connectionOp.getResult())) { + // Walk the parentOp and collect, in walk order, the users of the connection op. 
+ Value dma = npuCircularDmaOp.getConnection(); + SmallVector dmaUsers; + parentOp->walk([&](AMDAIE::DoublyStridedOpInterface op) { + auto connection = dyn_cast_if_present( + op->getOperand(0).getDefiningOp()); + if (connection == npuCircularDmaOp.getConnectionOp()) { + dmaUsers.push_back(op); + } + }); + if (circularUsersInSameScope(dma, dmaUsers)) { return rewriter.notifyMatchFailure( op, "Has users of same DMA in scope, analysis to check validity of " diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 0fc050ffc..005297b58 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -619,6 +619,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCompositionPass()); + passManager.addPass(createAMDAIECanonicalizeDoublyStridedOpPass()); + // NOTE(review): disabled repeat of the DMA composition pass added above; remove or re-enable: passManager.addPass(createAMDAIEDmaCompositionPass()); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); passManager.addPass(createAMDAIEDmaCSEPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt index f4004a9a5..fa789b9fe 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/CMakeLists.txt @@ -56,7 +56,6 @@ iree_lit_test_suite( "pack_and_transpose_level1.mlir" "pack_and_transpose_level2.mlir" "pack_to_air.mlir" - "convert_to_dma.mlir" "convert_to_dma_failures.mlir" "pad.mlir" "peel_for_loop.mlir"