Merge branch 'main' into docker_to_ghcr

onnx · Oct 18, 2024 · 9d891e1 · 9d891e1
2 parents b1c007c + 1435011
commit 9d891e1
Show file tree

Hide file tree

Showing 17 changed files with 574 additions and 74 deletions.
diff --git a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp
@@ -111,6 +111,8 @@ Value ZTensorHelper::getPreTransformedDescPtr(zdnn_data_types zDNNDataType,
   Type llvmZTensorDescStructTy = getZTensorDescStructTy(context);
   Value one = create.llvm.constant(llvmI64Ty, static_cast<int64_t>(1));
 
+  // Alloca is fine for LLVM structs; if we were to use alloc, we would also
+  // have to manually insert free calls. So alloca makes total sense here.
   Value preTransformedDescPtr = create.llvm._alloca(
       krnl::getPointerType(context, llvmZTensorDescStructTy),
       llvmZTensorDescStructTy, one,

diff --git a/src/Conversion/KrnlToAffine/KrnlMatmul.cpp b/src/Conversion/KrnlToAffine/KrnlMatmul.cpp
@@ -372,13 +372,7 @@ class KrnlMatmulLowering : public ConversionPattern {
     assert(BUFFER_ALIGN >= gDefaultAllocAlign &&
            "alignment of buffers cannot be smaller than the default alignment "
            "(which is set for SIMD correctness");
-    // TODO: alloca is good as it help simplify away this data structures (as it
-    // is only used as local temp, basically extensions of registers). However,
-    // there might be issues with non-removed alloca when they are not in the
-    // innermost loop. Still think its worth it having alloca as we want
-    // eventually all the refs to alloca to be register/spill access, not memory
-    // load/stores.
-    Value TmpProd = create.mem.alignedAlloca(CTmpType, BUFFER_ALIGN);
+    Value TmpProd = create.mem.alignedAlloc(CTmpType, BUFFER_ALIGN);
     // Init with zero.
     Value fZero = create.math.constant(elementType, 0);
     Value vFZero = create.vec.broadcast(vecType, fZero);
@@ -455,13 +449,7 @@ class KrnlMatmulLowering : public ConversionPattern {
     // Have to privatize CTmpType by unroll factor (1 if none).
     MemRefType CTmpType = MemRefType::get({unrollFactor}, vecType);
     assert(BUFFER_ALIGN >= gDefaultAllocAlign);
-    // TODO: alloca is good as it help simplify away this data structures (as it
-    // is only used as local temp, basically extensions of registers). However,
-    // there might be issues with non-removed alloca when they are not in the
-    // innermost loop. Still think its worth it having alloca as we want
-    // eventually all the refs to alloca to be register/spill access, not memory
-    // load/stores.
-    Value TmpC = create.mem.alignedAlloca(CTmpType, BUFFER_ALIGN);
+    Value TmpC = create.mem.alignedAlloc(CTmpType, BUFFER_ALIGN);
 
     // Iterates over the I indices (j are simd dim).
     Value iSaved, kSaved;
@@ -473,7 +461,7 @@ class KrnlMatmulLowering : public ConversionPattern {
           MultiDialectBuilder<MathBuilder, VectorBuilder> create(createAffine);
           Value i = loopInd[0];
           iSaved = i; // Saved for unroll and jam.
-          // Alloca temp vector TmpC and save C(i)/0.0 into it.
+          // Alloc temp vector TmpC and save C(i)/0.0 into it.
           Value initVal = create.vec.loadIE(vecType, C, cStart, {i, iZero});
           Value tmpCAccess = (unrollFactor > 1) ? i : zeroIE.getValue();
           createAffine.store(initVal, TmpC, tmpCAccess);

diff --git a/src/Conversion/KrnlToLLVM/KrnlEntryPoint.cpp b/src/Conversion/KrnlToLLVM/KrnlEntryPoint.cpp
@@ -233,7 +233,7 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {
     // entry point instead of the wrapped static entry point.
     Type memRefOutTy = staticEntryPointFuncTy.getReturnTypes()[0];
     Type memRefOutPtrTy = getPointerType(context, memRefOutTy);
-    Value ptrToOutMemRef =
+    Value ptrToOutMemRef = // alloca ok as there is only one entry point.
         create.llvm._alloca(memRefOutPtrTy, memRefOutTy, one, /*alignment=*/0);
     staticInputs.emplace_back(ptrToOutMemRef);
 
@@ -250,7 +250,7 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {
       // Original input is shifted by 1 in the iface func.
       Type memRefInTy = typeConverter.convertType(origInputMemRefTypes[i - 1]);
       Type memRefInPtrTy = getPointerType(context, memRefInTy);
-      Value ptrToMemRef =
+      Value ptrToMemRef = // alloca ok as there is only one entry point.
           create.llvm._alloca(memRefInPtrTy, memRefInTy, one, /*alignment=*/0);
 
       // Fill in the memref underlying ptrToMemRef with information extracted
@@ -287,7 +287,8 @@ class KrnlEntryPointOpLowering : public OpRewritePattern<KrnlEntryPointOp> {
 
     Value numOutput = create.llvm.constant(
         int64Ty, static_cast<int64_t>(outMemRefList.size()));
-    // Assume that OMTensor pointer size is 8
+    // Assume that OMTensor pointer size is 8.
+    // Alloca ok as it's only for 1 small data structure per parameter.
     Value outOmtPtrsArr = create.llvm._alloca(
         omTensorPtrAddrTy, opaquePtrTy, numOutput, /*alignment=*/0);
 

diff --git a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp
@@ -87,6 +87,8 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
           MultiDialectBuilder<KrnlBuilder, MemRefBuilder, MathBuilder> create(
               createKrnl);
           // Create temp, single scalar, no need for default alignment.
+          // Alloca is ok here as it's for a scalar, and in the generic version
+          // of GEMM.
           Value red = create.mem.alloca(MemRefType::get({}, elementType));
           // Set to zero.
           create.krnl.store(zeroVal, red);
@@ -203,14 +205,6 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
     MemRefType bTileType =
         MemRefType::get({kCacheTile, jCacheTile}, elementType);
     SmallVector<IndexExpr, 1> empty;
-    // Allocate here on heap, only when no parallelism.
-    Value aBuff, bBuff, rBuff;
-    if (!enableParallel) {
-      aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
-      bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
-      if (mustTileR)
-        rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
-    }
 
     // 3) introduce the loops and permute them
     // I, J, K loop.
@@ -253,13 +247,10 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
           {I, J, K},
           [&](const KrnlBuilder &createKrnl, ValueRange i1_j1_indices) {
             Value i1(i1_j1_indices[0]), j1(i1_j1_indices[1]);
-            // If parallel, allocate on stack inside the parallel region.
-            if (enableParallel) {
-              aBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
-              bBuff = create.mem.alignedAlloca(bTileType, BUFFER_ALIGN);
-              if (mustTileR)
-                rBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
-            }
+            // If parallel, it will stay inside, otherwise it will migrate out.
+            Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
+            Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
+            Value rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
             createKrnl.copyToBuffer(rBuff, R, {i1, j1}, zeroVal, false);
             createKrnl.iterateIE({}, {kk1}, {}, {},
                 [&](const KrnlBuilder &createKrnl, ValueRange k1_index) {
@@ -321,13 +312,9 @@ struct ONNXGemmOpLowering : public OpConversionPattern<GemmOp> {
           {J, K, I},
           [&](const KrnlBuilder &createKrnl, ValueRange j1_k1_indices) {
             Value j1(j1_k1_indices[0]), k1(j1_k1_indices[1]);
-            // If parallel, allocate on stack inside the parallel region.
-            if (enableParallel) {
-              aBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
-              bBuff = create.mem.alignedAlloca(bTileType, BUFFER_ALIGN);
-              if (mustTileR)
-                rBuff = create.mem.alignedAlloca(aTileType, BUFFER_ALIGN);
-            }
+            // If parallel, it will stay inside, otherwise it will migrate out.
+            Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN);
+            Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN);
             if (bTrans)
               createKrnl.copyToBuffer(bBuff, B, {j1, k1}, zeroVal, true);
             else

diff --git a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp
@@ -1063,7 +1063,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
 
   void genOneHorizontalSimdReduction(ConversionPatternRewriter &rewriter,
       MDBuilder &create, Operation *op, Type elementType, VectorType vecType,
-      Value tmpAlloca, Value flatInput, Value flatAlloc, Value initVec,
+      Value tmpAlloc, Value flatInput, Value flatAlloc, Value initVec,
       Value divisorForMean, ValueRange outLoopInd, Value simdUB, int64_t VL,
       bool simdOnly) const {
     IndexExpr lb = LitIE(0);
@@ -1076,7 +1076,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
         rewriter, create.getLoc(), elementType);
     create.krnl.simdReduceIE(lb, ub, VL, simdOnly,
         /* inputs*/ {flatInput}, {inputAF},
-        /* temp */ {tmpAlloca}, {tmpAF},
+        /* temp */ {tmpAlloc}, {tmpAF},
         /* output */ {flatAlloc}, {outputAF},
         /* init */ {identity},
         /* reduction simd/scalar */
@@ -1145,20 +1145,21 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
         onnxToKrnlParallelReport(
             op, true, 0, lbs[0], flatOutDims[0], "reduction h-simd");
       } else {
+        enableParallel = false;
         onnxToKrnlParallelReport(op, false, 0, lbs[0], flatOutDims[0],
             "not enough work for reduction h-simd");
       }
     }
     create.krnl.iterateIE(outLoopDef, outLoopDef, lbs, flatOutDims,
         [&](const KrnlBuilder &ck, ValueRange outLoopInd) {
           MDBuilder create(ck);
-          // Allocate temp inside loop (because of parallel).
-          Value tmpAlloca = create.mem.alignedAlloca(tmpType);
+          // When parallel, it will stay inside; otherwise it will migrate out.
+          Value tmpAlloc = create.mem.alignedAlloc(tmpType);
           Value identity = getIdentityValue<ONNXReductionOp>(
               rewriter, create.getLoc(), elementType);
           Value initVec = create.vec.splat(vecType, identity);
           genOneHorizontalSimdReduction(rewriter, create, op, elementType,
-              vecType, tmpAlloca, flatInput, flatAlloc, initVec, divisorForMean,
+              vecType, tmpAlloc, flatInput, flatAlloc, initVec, divisorForMean,
               outLoopInd, simdUB, VL, simdOnly);
         });
   }
@@ -1183,7 +1184,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
 
   void genVlHorizontalSimdReduction(ConversionPatternRewriter &rewriter,
       MDBuilder &create, Operation *op, Type elementType, VectorType vecType,
-      Value tmpBlockedAlloca, Value flatInput, Value flatAlloc, Value initVec,
+      Value tmpBlockedAlloc, Value flatInput, Value flatAlloc, Value initVec,
       Value divisorForMean, ValueRange blockedOutLoopInd,
       IndexExpr blockedCurrIndex, Value simdUB, int64_t VL,
       bool simdOnly) const {
@@ -1200,7 +1201,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
         rewriter, create.getLoc(), elementType);
     if (simdOnly) {
       create.affine.simdReduce2DIE(
-          lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloca, tmpAF,
+          lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloc, tmpAF,
           flatAlloc, outputAF, identity,
           [&](const AffineBuilder &b, Value inputVal, Value tmpVal,
               int64_t VL) {
@@ -1215,7 +1216,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
           });
     } else {
       create.scf.simdReduce2DIE( // Affine fails with dynamic shapes.
-          lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloca, tmpAF,
+          lb, ub, VL, simdOnly, flatInput, inputAF, tmpBlockedAlloc, tmpAF,
           flatAlloc, outputAF, identity,
           [&](const SCFBuilder &b, Value inputVal, Value tmpVal, int64_t VL) {
             Type type = VL > 1 ? vecType : elementType;
@@ -1298,15 +1299,16 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
         onnxToKrnlParallelReport(op, true, parId, lbs[parId],
             flatOutDims[parId], "reduction shuffle h-simd");
       } else {
+        enableParallel = false;
         onnxToKrnlParallelReport(op, false, 0, lbs[0], flatOutDims[0],
             "not enough work for reduction shuffle h-simd");
       }
     }
     create.krnl.iterateIE(outLoopDef, optimizedOutLoopDef, lbs, flatOutDims,
         [&](const KrnlBuilder &ck, ValueRange blockedOutLoopInd) {
           MDBuilder create(ck);
-          // Create temp inside loop (because of parallel).
-          Value tmpBlockedAlloca = create.mem.alignedAlloca(tmpBlockedType);
+          // When parallel, it will stay inside; otherwise it will migrate out.
+          Value tmpBlockedAlloc = create.mem.alignedAlloc(tmpBlockedType);
           Value identity = getIdentityValue<ONNXReductionOp>(
               rewriter, create.getLoc(), elementType);
           Value initVec = create.vec.splat(vecType, identity);
@@ -1336,7 +1338,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
                       outLoopInd.emplace_back(blockLocalInd);
                       // Perform reduction for one output value.
                       genOneHorizontalSimdReduction(rewriter, create, op,
-                          elementType, vecType, tmpBlockedAlloca, flatInput,
+                          elementType, vecType, tmpBlockedAlloc, flatInput,
                           flatAlloc, initVec, divisorForMean, outLoopInd,
                           simdUB, VL, simdOnly);
                     }); /* for inside blocked loop */
@@ -1345,7 +1347,7 @@ struct ONNXReductionOpLowering : public OpConversionPattern<ONNXReductionOp> {
                 MDBuilder create(scf);
                 // create.krnl.printf("full tile\n");
                 genVlHorizontalSimdReduction(rewriter, create, op, elementType,
-                    vecType, tmpBlockedAlloca, flatInput, flatAlloc, initVec,
+                    vecType, tmpBlockedAlloc, flatInput, flatAlloc, initVec,
                     divisorForMean, blockedOutLoopInd, blockedCurrIndex, simdUB,
                     VL, simdOnly);
               });

diff --git a/src/Conversion/ONNXToKrnl/NN/Normalization.cpp b/src/Conversion/ONNXToKrnl/NN/Normalization.cpp
@@ -193,6 +193,7 @@ struct ONNXInstanceNormalizationOpLowering
     create.krnlIE.getShapeAsSymbols(inputMemRef, inputBounds);
     MemRefType tmpType = MemRefType::get({}, elementType);
     Value fZero = create.math.constant(elementType, 0);
+    // Ok to use alloca, just one scalar.
     Value tmpMemRef = create.mem.alloca(tmpType);
 
     // Compute the number of values in a single channel: product of spatial
@@ -957,12 +958,21 @@ struct GenericLayerNormaOpLowering : public OpConversionPattern<OP_TYPE> {
     } else {
       onnxToKrnlParallelReport(op, false, -1, -1, "no parallel in layer norm");
     }
+    Value tmpRedMemRef, tmpRedMemRef2;
+    if (!useParallel) {
+      // Sequential, alloc before loop.
+      tmpRedMemRef = create.mem.alignedAlloc(tmpRedType);
+      tmpRedMemRef2 = create.mem.alignedAlloc(tmpRedType);
+    }
     create.krnl.forLoopIE(LitIE(0), XFlatDims[0], /*step*/ B, useParallel,
         [&](const KrnlBuilder &ck, ValueRange blockedLoopIndices) {
           MDBuilder create(ck);
           IndexExprScope innerScope(ck);
-          Value tmpRedMemRef = create.mem.alignedAlloca(tmpRedType);
-          Value tmpRedMemRef2 = create.mem.alignedAlloca(tmpRedType);
+          if (useParallel) {
+            // Parallel, alloc inside parallel loop.
+            tmpRedMemRef = create.mem.alignedAlloc(tmpRedType);
+            tmpRedMemRef2 = create.mem.alignedAlloc(tmpRedType);
+          }
           IndexExpr blockedCurrIndex = DimIE(blockedLoopIndices[0]);
           IndexExpr blockedUB = SymIE(XFlatDims[0]);
           IndexExpr isFull =

diff --git a/src/Conversion/ONNXToKrnl/NN/Pooling.cpp b/src/Conversion/ONNXToKrnl/NN/Pooling.cpp
@@ -310,7 +310,7 @@ struct ONNXPoolOpLowering : public OpConversionPattern<PoolOp> {
     // Identity value of the operation.
     auto identity = getIdentityValue<PoolOp>(rewriter, loc, outputElementType);
     // Create a local reduction value for output[n][c][ho][wo].
-    // Single scalar, no need for default alignment.
+    // Single scalar, no need for default alignment. Ok to use alloca.
     Value reductionVal =
         create.mem.alloca(MemRefType::get({}, memRefType.getElementType()));
 

diff --git a/src/Conversion/ONNXToKrnl/ObjectDetection/NonMaxSuppression.cpp b/src/Conversion/ONNXToKrnl/ObjectDetection/NonMaxSuppression.cpp
@@ -110,6 +110,7 @@ static void suppressByScores(ConversionPatternRewriter &rewriter, Location loc,
   Value zero = create.math.constantIndex(0);
   Value one = create.math.constantIndex(1);
   // Store the number of scores whose value is greater than the threshold.
+  // Scalar, ok to use alloca.
   Value topk = create.mem.alloca(MemRefType::get({}, indexType));
 
   // Compute the effective max output per class.
@@ -272,6 +273,7 @@ struct ONNXNonMaxSuppressionOpLowering
 
     // Refine the number of output boxes per class by suppressing it using
     // spatial dimension size and score threshold.
+    // Scalar, ok to use alloca.
     Value maxOutputPerClass = create.mem.alloca(MemRefType::get({}, indexType));
     // 1. Suppress by using spatial dimension size.
     Value x = create.math.castToIndex(maxOutputBoxPerClass);
@@ -312,6 +314,7 @@ struct ONNXNonMaxSuppressionOpLowering
     // dim of the output, which is suppressed by IOU during computation and
     // cannot be computed in advance.
     // Final output shape : [effective_num_selected_indices, 3]
+    // Scalar, ok to use alloca.
     Value effectiveNumSelectedIndices =
         create.mem.alloca(MemRefType::get({}, indexType));
     create.krnl.store(zero, effectiveNumSelectedIndices);

diff --git a/src/Conversion/ONNXToKrnl/Tensor/Compress.cpp b/src/Conversion/ONNXToKrnl/Tensor/Compress.cpp
@@ -55,6 +55,7 @@ struct ONNXCompressOpLowering : public OpConversionPattern<ONNXCompressOp> {
     // Create temp memory for summing up the true value and init to zero.
     Type indexType = rewriter.getIndexType();
     MemRefType indexMemRefType = MemRefType::get({}, indexType);
+    // Scalar, ok to use alloca.
     Value sumMemRef = create.mem.alloca(indexMemRefType);
     create.krnl.store(zeroIE.getValue(), sumMemRef);
     // Now create a loop to iterate over all conditions.
@@ -142,6 +143,7 @@ struct ONNXCompressOpLowering : public OpConversionPattern<ONNXCompressOp> {
         }
       }
 
+      // Scalar, ok to use alloca.
       Value readIndexMemRef = create.mem.alloca(indexMemRefType);
       create.krnl.store(zeroIE.getValue(), readIndexMemRef);
 

diff --git a/src/Conversion/ONNXToKrnl/Tensor/GatherND.cpp b/src/Conversion/ONNXToKrnl/Tensor/GatherND.cpp
@@ -122,6 +122,7 @@ struct ONNXGatherNDOpLowering : public OpConversionPattern<ONNXGatherNDOp> {
     // Initialize the index used to store the result values.
     Value iZero = create.math.constantIndex(0);
     Value iOne = create.math.constantIndex(1);
+    // Scalar, ok to use alloca.
     Value storeIndex =
         create.mem.alloca(MemRefType::get({}, rewriter.getIndexType()));
     create.krnl.store(iZero, storeIndex);

diff --git a/src/Conversion/ONNXToKrnl/Tensor/NonZero.cpp b/src/Conversion/ONNXToKrnl/Tensor/NonZero.cpp
@@ -112,6 +112,7 @@ struct ONNXNonZeroOpLowering : public OpConversionPattern<ONNXNonZeroOp> {
     create.krnlIE.getShapeAsDims(X, xUbs);
 
     // Emit a variable for the total number of nonzero values.
+    // Scalar, ok to use alloca.
     Value nonzeroCount = create.mem.alloca(MemRefType::get({}, indexTy));
     create.krnl.store(iZero, nonzeroCount);
 
@@ -176,6 +177,7 @@ struct ONNXNonZeroOpLowering : public OpConversionPattern<ONNXNonZeroOp> {
     //   out[0][i] = p
     // ```
 
+    // Scalars, ok to use alloca.
     Value pos = create.mem.alloca(MemRefType::get({}, indexTy));
     Value sum = create.mem.alloca(MemRefType::get({}, indexTy));
     ValueRange iLoopDef = create.krnl.defineLoops(1);

diff --git a/src/Conversion/ONNXToKrnl/Tensor/Unique.cpp b/src/Conversion/ONNXToKrnl/Tensor/Unique.cpp
@@ -125,6 +125,7 @@ struct ONNXUniqueOpLowering : public ConversionPattern {
     //
     Type indexTy = rewriter.getIndexType();
     Value iZero = create.math.constantIndex(0);
+    // Scalar, ok to use alloca.
     Value uniqueCount = create.mem.alloca(MemRefType::get({}, indexTy));
     create.krnl.store(iZero, uniqueCount);
     Value noneValue;

diff --git a/src/Dialect/Mlir/DialectBuilder.hpp b/src/Dialect/Mlir/DialectBuilder.hpp
@@ -330,6 +330,9 @@ struct MemRefBuilder final : DialectBuilder {
   // currently executing function, to be automatically released when this
   // function returns to its caller. It is strongly suggested to place alloca
   // instructions outside of a loop.
+  //
+  // When possible, DO NOT USE ALLOCA except for a few scalars.
+  //
   mlir::memref::AllocaOp alloca(mlir::MemRefType type) const;
   mlir::memref::AllocaOp alignedAlloca(
       mlir::MemRefType type, int64_t align = defaultAlign) const;