From a2230f187762c0022fba307edcc5af61b6cb5d75 Mon Sep 17 00:00:00 2001 From: Alexandre Eichenberger Date: Thu, 17 Oct 2024 22:29:00 -0400 Subject: [PATCH] responding to comments Signed-off-by: Alexandre Eichenberger --- .../ZLowToLLVM/ZLowToLLVMCommon.cpp | 6 +- src/Conversion/ONNXToKrnl/Math/Gemm.cpp | 25 +-- src/Conversion/ONNXToKrnl/Math/Reduction.cpp | 22 +-- .../Math/Reduction_with_canonicalize_O3.mlir | 43 ++--- .../Normalization_O3_SIMD_canonicalize.mlir | 164 +++++++++--------- 5 files changed, 119 insertions(+), 141 deletions(-) diff --git a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp index d60563b3af..4ffcdc6baa 100644 --- a/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp +++ b/src/Accelerators/NNPA/Conversion/ZLowToLLVM/ZLowToLLVMCommon.cpp @@ -111,7 +111,8 @@ Value ZTensorHelper::getPreTransformedDescPtr(zdnn_data_types zDNNDataType, Type llvmZTensorDescStructTy = getZTensorDescStructTy(context); Value one = create.llvm.constant(llvmI64Ty, static_cast(1)); - // TODO: evaluate if a heap alloc would not be better. + // Alloca is fine for LLVM structs; if we were to use alloc, we would also need to + // manually insert free calls. So alloca makes total sense here. Value preTransformedDescPtr = create.llvm._alloca( krnl::getPointerType(context, llvmZTensorDescStructTy), llvmZTensorDescStructTy, one, @@ -155,7 +156,6 @@ Value ZTensorHelper::getTransformedDescPtr( Type llvmZTensorDescStructTy = getZTensorDescStructTy(context); Value one = create.llvm.constant(llvmI64Ty, static_cast(1)); - // TODO: evaluate if a heap alloc would not be better. 
Value transformedDescPtr = create.llvm._alloca( krnl::getPointerType(context, llvmZTensorDescStructTy), llvmZTensorDescStructTy, one, @@ -217,7 +217,6 @@ ZTensor ZTensorHelper::getZTensor(Value bufferPtr, zdnn_data_types dataType, Value transformedDescPtr = getTransformedDescPtr(preTransformedDescPtr, isConcat, concatInfo); // Create the input zTensor. - // TODO: evaluate if a heap alloc would not be better. Value alloc = create.llvm._alloca(krnl::getPointerType(context, llvmZTensorStructTy), llvmZTensorStructTy, one, @@ -253,7 +252,6 @@ ZTensor ZTensorHelper::getZTensor(Value preTransformedDescPtr, Type llvmZTensorStructTy = getZTensorStructTy(context); Value one = create.llvm.constant(rewriter.getI64Type(), static_cast(1)); - // TODO: evaluate if a heap alloc would not be better. Value alloc = create.llvm._alloca(krnl::getPointerType(context, llvmZTensorStructTy), llvmZTensorStructTy, one, diff --git a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp index c6424360fc..af0724c446 100644 --- a/src/Conversion/ONNXToKrnl/Math/Gemm.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Gemm.cpp @@ -205,14 +205,6 @@ struct ONNXGemmOpLowering : public OpConversionPattern { MemRefType bTileType = MemRefType::get({kCacheTile, jCacheTile}, elementType); SmallVector empty; - // Allocate here on heap, only when no parallelism. - Value aBuff, bBuff, rBuff; - if (!enableParallel) { - aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); - bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN); - if (mustTileR) - rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); - } // 3) introduce the loops and permute them // I, J, K loop. 
@@ -255,11 +247,10 @@ struct ONNXGemmOpLowering : public OpConversionPattern { {I, J, K}, [&](const KrnlBuilder &createKrnl, ValueRange i1_j1_indices) { Value i1(i1_j1_indices[0]), j1(i1_j1_indices[1]); - if (enableParallel) { - aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); - bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN); - rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); - } + // If parallel, will stay inside, otherwise will migrate out. + Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); + Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN); + Value rBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); createKrnl.copyToBuffer(rBuff, R, {i1, j1}, zeroVal, false); createKrnl.iterateIE({}, {kk1}, {}, {}, [&](const KrnlBuilder &createKrnl, ValueRange k1_index) { @@ -321,11 +312,9 @@ struct ONNXGemmOpLowering : public OpConversionPattern { {J, K, I}, [&](const KrnlBuilder &createKrnl, ValueRange j1_k1_indices) { Value j1(j1_k1_indices[0]), k1(j1_k1_indices[1]); - // If parallel, allocate on stack inside the parallel region. - if (enableParallel) { - aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); - bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN); - } + // If parallel, it will stay inside, otherwise it will migrate out. + Value aBuff = create.mem.alignedAlloc(aTileType, BUFFER_ALIGN); + Value bBuff = create.mem.alignedAlloc(bTileType, BUFFER_ALIGN); if (bTrans) createKrnl.copyToBuffer(bBuff, B, {j1, k1}, zeroVal, true); else diff --git a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp index 03f0672abd..2e3892324e 100644 --- a/src/Conversion/ONNXToKrnl/Math/Reduction.cpp +++ b/src/Conversion/ONNXToKrnl/Math/Reduction.cpp @@ -1150,18 +1150,11 @@ struct ONNXReductionOpLowering : public OpConversionPattern { "not enough work for reduction h-simd"); } } - Value tmpAlloc; - if (!enableParallel) { - // No parallel, alloc once outside. 
- tmpAlloc = create.mem.alignedAlloc(tmpType); - } create.krnl.iterateIE(outLoopDef, outLoopDef, lbs, flatOutDims, [&](const KrnlBuilder &ck, ValueRange outLoopInd) { MDBuilder create(ck); - if (enableParallel) { - // Allocate temp inside loop because of parallel. - tmpAlloc = create.mem.alignedAlloc(tmpType); - } + // When parallel, will stay inside; otherwise will migrate out. + Value tmpAlloc = create.mem.alignedAlloc(tmpType); Value identity = getIdentityValue( rewriter, create.getLoc(), elementType); Value initVec = create.vec.splat(vecType, identity); @@ -1311,18 +1304,11 @@ struct ONNXReductionOpLowering : public OpConversionPattern { "not enough work for reduction shuffle h-simd"); } } - Value tmpBlockedAlloc; - if (!enableParallel) { - // Sequential, can allocate before loop. - tmpBlockedAlloc = create.mem.alignedAlloc(tmpBlockedType); - } create.krnl.iterateIE(outLoopDef, optimizedOutLoopDef, lbs, flatOutDims, [&](const KrnlBuilder &ck, ValueRange blockedOutLoopInd) { MDBuilder create(ck); - if (enableParallel) { - // Create temp inside loop because of parallel. - tmpBlockedAlloc = create.mem.alignedAlloc(tmpBlockedType); - } + // When parallel, will stay inside; otherwise will migrate out. 
+ Value tmpBlockedAlloc = create.mem.alignedAlloc(tmpBlockedType); Value identity = getIdentityValue( rewriter, create.getLoc(), elementType); Value initVec = create.vec.splat(vecType, identity); diff --git a/test/mlir/conversion/onnx_to_krnl/Math/Reduction_with_canonicalize_O3.mlir b/test/mlir/conversion/onnx_to_krnl/Math/Reduction_with_canonicalize_O3.mlir index aba9128ee3..9a18e44b77 100644 --- a/test/mlir/conversion/onnx_to_krnl/Math/Reduction_with_canonicalize_O3.mlir +++ b/test/mlir/conversion/onnx_to_krnl/Math/Reduction_with_canonicalize_O3.mlir @@ -320,9 +320,9 @@ func.func private @gpt2_original(%arg0 : tensor) -> tensor, memref<2xindex>) -> memref // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_0_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_0]){ -// CHECK: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_8_:%.+]] = affine.apply [[MAP_1_]]([[VAR_7_]]#1){{.}}[[VAR_dim_0_]]{{.}} // CHECK: [[VAR_9_:%.+]] = arith.cmpi slt, [[VAR_8_]], [[CST_0_]] : index // CHECK: scf.if [[VAR_9_]] { @@ -431,9 +431,9 @@ func.func private @gpt2_no_keepdims(%arg0 : tensor) -> tensor<*xf32 // CHECK-DAG: [[VAR_5_:%.+]] = arith.sitofp [[VAR_4_]] : i64 to f32 // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) 
-// CHECK: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_0_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_0]){ -// CHECK: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_8_:%.+]] = affine.apply [[MAP_1_]]([[VAR_7_]]#1){{.}}[[VAR_dim_0_]]{{.}} // CHECK: [[VAR_9_:%.+]] = arith.cmpi slt, [[VAR_8_]], [[CST_0_]] : index // CHECK: scf.if [[VAR_9_]] { @@ -553,9 +553,9 @@ func.func private @gpt2_reduce2(%arg0 : tensor) -> tensor<*xf32> { // CHECK-DAG: [[VAR_reshape_7_:%.+]] = memref.reshape [[RES_]]([[RES_]]_6) : (memref, memref<2xindex>) -> memref // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_0_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_0]){ -// CHECK: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_8_:%.+]] = affine.apply [[MAP_1_]]([[VAR_7_]]#1){{.}}[[VAR_dim_0_]]{{.}} // CHECK: [[VAR_9_:%.+]] = arith.cmpi slt, [[VAR_8_]], [[CST_0_]] : index // CHECK: scf.if 
[[VAR_9_]] { @@ -677,9 +677,9 @@ func.func private @gpt2_one_not_multiple(%arg0 : tensor) -> tensor // CHECK-DAG: [[VAR_reshape_7_:%.+]] = memref.reshape [[RES_]]([[RES_]]_6) : (memref, memref<2xindex>) -> memref // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_0_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_0]){ -// CHECK: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_8_:%.+]] = affine.apply [[MAP_1_]]([[VAR_7_]]#1){{.}}[[VAR_dim_0_]]{{.}} // CHECK: [[VAR_9_:%.+]] = arith.cmpi slt, [[VAR_8_]], [[CST_0_]] : index // CHECK: scf.if [[VAR_9_]] { @@ -802,9 +802,9 @@ func.func private @gpt2_no_simd_as_not_mult_of_VL(%arg0 : tensor) // CHECK-DAG: [[VAR_reshape_7_:%.+]] = memref.reshape [[RES_]]([[RES_]]_6) : (memref, memref<2xindex>) -> memref // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to [[MAP_0_]](){{.}}[[VAR_dim_]], [[VAR_dim_]]_0]){ -// CHECK: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// 
CHECK-DAG: [[VAR_7_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_8_:%.+]] = affine.apply [[MAP_1_]]([[VAR_7_]]#1){{.}}[[VAR_dim_0_]]{{.}} // CHECK: [[VAR_9_:%.+]] = arith.cmpi slt, [[VAR_8_]], [[CST_0_]] : index // CHECK: scf.if [[VAR_9_]] { @@ -922,9 +922,10 @@ func.func private @test_reducemax_v13_bis(%arg0 : tensor<1028x256xf32>) -> tenso // CHECK-DAG: [[RES_:%.+]] = memref.alloc() {{.*}}: memref<1028xf32> // CHECK-DAG: [[LOOP_0_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]] 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[BLOCK_TILE__0_]]) with ([[LOOP_0_]] -> [[I_0_:%.+]] = 0 to 1028){ -// CHECK: [[VAR_1_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__0_]]) : (!krnl.loop) -> index +// CHECK-DAG: [[VAR_1_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__0_]]) : (!krnl.loop) -> index +// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> +// CHECK-NOT: separator of consecutive DAGs // CHECK-DAG: [[VAR_2_:%.+]] = affine.apply [[MAP_0_]]([[VAR_1_]]) // CHECK-DAG: [[VAR_3_:%.+]] = affine.apply [[MAP_1_]]([[VAR_1_]]) // CHECK-DAG: [[VAR_4_:%.+]] = affine.apply [[MAP_2_]]([[VAR_1_]]) @@ -995,9 +996,9 @@ func.func private @test_reducemax_v13_small(%arg0 : tensor<7x8xf32>) -> tensor<* // CHECK-DAG: [[RES_:%.+]] = memref.alloc() {{.*}}: memref<7xf32> // CHECK-DAG: [[LOOP_0_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]] 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[BLOCK_TILE__0_]]) with ([[LOOP_0_]] -> [[I_0_:%.+]] = 0 to 7){ -// CHECK: [[VAR_1_:%.+]] = 
krnl.get_induction_var_value([[BLOCK_TILE__0_]]) : (!krnl.loop) -> index +// CHECK-DAG: [[VAR_1_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__0_]]) : (!krnl.loop) -> index +// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: [[VAR_2_:%.+]] = affine.apply [[MAP_0_]]([[VAR_1_]]) // CHECK: [[VAR_3_:%.+]] = arith.cmpi slt, [[VAR_2_]], [[CST_0_]] : index // CHECK: scf.if [[VAR_3_]] { @@ -1079,9 +1080,9 @@ func.func private @test_reducemax_int_v13(%arg0 : tensor<128x256x768xi32>) -> te // CHECK-DAG: [[CST_0_:%.+]] = arith.constant 0 : index // CHECK-DAG: [[RES_:%.+]] = memref.alloc() {{.*}}: memref<128x256xi32> // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 -// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<1x32xi32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[LOOP_0_]]#1) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to 128, [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to 256){ -// CHECK: [[VAR_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[LOOP_0_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[LOOP_0_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_1_:%.+]] = memref.alloc() {{.*}}: memref<1x32xi32> // CHECK: vector.store [[VAR_cst_]], [[RES_1_]]{{.}}[[CST_0_]], [[CST_0_]]{{.}} : memref<1x32xi32>, vector<32xi32> // CHECK: [[LOOP_1_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_1_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) @@ -1138,9 +1139,10 @@ func.func private @bertsquad10_same_pattern(%arg0 : tensor) -> te // CHECK-DAG: [[VAR_reshape_:%.+]] = memref.reshape [[RES_]]([[RES_]]_1) : (memref, memref<2xindex>) -> memref // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> 
// CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to [[VAR_dim_]], [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to 256){ -// CHECK: [[VAR_6_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_6_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> +// CHECK-NOT: separator of consecutive DAGs // CHECK-DAG: [[VAR_7_:%.+]] = affine.apply [[MAP_2_]]([[VAR_6_]]#1) // CHECK-DAG: [[VAR_8_:%.+]] = affine.apply [[MAP_3_]]([[VAR_6_]]#1) // CHECK-DAG: [[VAR_9_:%.+]] = affine.apply [[MAP_4_]]([[VAR_6_]]#1) @@ -1220,9 +1222,10 @@ func.func private @bertsquad10_const_pattern(%arg0 : tensor<1x256x768xf32>) -> t // CHECK-DAG: [[VAR_reshape_:%.+]] = memref.reshape [[RES_]]([[RES_]]_1) : (memref<1x256x1xf32>, memref<2xindex>) -> memref<1x256xf32> // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to 1, [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to 256){ -// CHECK: [[VAR_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_2_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> +// CHECK-NOT: separator of consecutive DAGs // CHECK-DAG: [[VAR_2_:%.+]] = affine.apply [[MAP_0_]]([[VAR_1_]]#1) // CHECK-DAG: [[VAR_3_:%.+]] = affine.apply [[MAP_1_]]([[VAR_1_]]#1) // CHECK-DAG: [[VAR_4_:%.+]] = affine.apply 
[[MAP_2_]]([[VAR_1_]]#1) diff --git a/test/mlir/conversion/onnx_to_krnl/NN/Normalization_O3_SIMD_canonicalize.mlir b/test/mlir/conversion/onnx_to_krnl/NN/Normalization_O3_SIMD_canonicalize.mlir index 7aef01ecbb..5f075f9626 100644 --- a/test/mlir/conversion/onnx_to_krnl/NN/Normalization_O3_SIMD_canonicalize.mlir +++ b/test/mlir/conversion/onnx_to_krnl/NN/Normalization_O3_SIMD_canonicalize.mlir @@ -63,9 +63,10 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK-DAG: [[VAR_reshape_4_:%.+]] = memref.reshape [[RES_]]([[RES_]]_3) : (memref<2x64x1x1xf32>, memref<2xindex>) -> memref<2x64xf32> // CHECK-DAG: [[LOOP_0_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__0_:%.+]], [[BLOCK_IN__0_:%.+]] = krnl.block [[LOOP_0_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) with ([[LOOP_0_]]#0 -> [[I_0_:%.+]] = 0 to 2, [[LOOP_0_]]#1 -> [[I_1_:%.+]] = 0 to 64){ -// CHECK: [[VAR_8_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_8_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_0_]]#0, [[BLOCK_TILE__0_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_3_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> +// CHECK-NOT: separator of consecutive DAGs // CHECK-DAG: [[VAR_9_:%.+]] = affine.apply [[MAP_0_]]([[VAR_8_]]#1) // CHECK-DAG: [[VAR_10_:%.+]] = affine.apply [[MAP_1_]]([[VAR_8_]]#1) // CHECK-DAG: [[VAR_11_:%.+]] = affine.apply [[MAP_2_]]([[VAR_8_]]#1) @@ -130,43 +131,43 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK-DAG: [[RES_4_:%.+]] = memref.alloc() {{.*}}: memref<2x64x1x1xf32> // CHECK-DAG: [[RES_5_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_5_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_8_:%.+]] = 
memref.reshape [[RES_]]([[RES_]]_7) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_7_:%.+]] = memref.reshape [[RES_]]([[RES_]]_6) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_6_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_6_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_10_:%.+]] = memref.reshape [[RES_]]([[RES_]]_9) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_9_:%.+]] = memref.reshape [[RES_]]([[RES_]]_8) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_7_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_7_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_12_:%.+]] = memref.reshape [[RES_4_]]([[RES_7_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK: [[VAR_reshape_11_:%.+]] = memref.reshape [[RES_4_]]([[RES_7_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_1_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__1_:%.+]], [[BLOCK_IN__1_:%.+]] = krnl.block [[LOOP_1_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__1_]]) with ([[LOOP_1_]] -> [[I_3_:%.+]] = 0 to 128){ // CHECK: [[VAR_9_1_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__1_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[VAR_10_1_:%.+]] = vector.load [[VAR_reshape_8_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> -// CHECK-DAG: [[VAR_11_1_:%.+]] = vector.load [[VAR_reshape_10_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_10_1_:%.+]] = vector.load [[VAR_reshape_7_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_11_1_:%.+]] = vector.load [[VAR_reshape_9_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_4_:%.+]] = 
arith.mulf [[VAR_10_1_]], [[VAR_11_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_]], [[VAR_reshape_12_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_]], [[VAR_reshape_11_]]{{.}}[[VAR_9_1_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_8_:%.+]] = memref.alloc() {{.*}}: memref<2x64x31x3xf32> // CHECK-DAG: [[RES_9_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_11904_]], [[RES_9_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_15_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_9_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> +// CHECK-DAG: [[VAR_reshape_14_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_9_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> // CHECK-DAG: [[RES_10_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_11904_]], [[RES_10_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_17_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_10_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> +// CHECK-DAG: [[VAR_reshape_16_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_10_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> // CHECK-DAG: [[RES_11_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_11904_]], [[RES_11_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_19_:%.+]] = memref.reshape [[RES_8_]]([[RES_11_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> +// CHECK: [[VAR_reshape_18_:%.+]] = memref.reshape [[RES_8_]]([[RES_11_]]) : (memref<2x64x31x3xf32>, memref<1xindex>) -> memref<11904xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_2_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__2_:%.+]], [[BLOCK_IN__2_:%.+]] = krnl.block [[LOOP_2_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__2_]]) with ([[LOOP_2_]] -> [[I_4_:%.+]] = 0 to 
11904){ // CHECK: [[VAR_9_2_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__2_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[VAR_10_1_:%.+]] = vector.load [[VAR_reshape_15_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> -// CHECK-DAG: [[VAR_11_1_:%.+]] = vector.load [[VAR_reshape_17_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_10_1_:%.+]] = vector.load [[VAR_reshape_14_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_11_1_:%.+]] = vector.load [[VAR_reshape_16_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_4_1_:%.+]] = arith.mulf [[VAR_10_1_]], [[VAR_11_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_1_]], [[VAR_reshape_19_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_1_]], [[VAR_reshape_18_]]{{.}}[[VAR_9_2_]]{{.}} : memref<11904xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_12_:%.+]] = memref.alloc() {{.*}}: memref<2x64x1x1xf32> @@ -174,16 +175,17 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: affine.store [[CST_2_]], [[RES_13_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_13_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_13_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_22_:%.+]] = memref.reshape [[RES_8_]]([[RES_13_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_21_:%.+]] = memref.reshape [[RES_8_]]([[RES_13_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[RES_14_:%.+]] = memref.alloc() {{.*}}: memref<2xindex> // CHECK: affine.store [[CST_2_]], [[RES_14_]][0] : memref<2xindex> // CHECK: affine.store [[CST_64_]], [[RES_14_]][1] : memref<2xindex> -// CHECK-DAG: [[VAR_reshape_24_:%.+]] = memref.reshape [[RES_12_]]([[RES_14_]]) : (memref<2x64x1x1xf32>, 
memref<2xindex>) -> memref<2x64xf32> +// CHECK-DAG: [[VAR_reshape_23_:%.+]] = memref.reshape [[RES_12_]]([[RES_14_]]) : (memref<2x64x1x1xf32>, memref<2xindex>) -> memref<2x64xf32> // CHECK-DAG: [[LOOP_3_:%.+]]:2 = krnl.define_loops 2 // CHECK: [[BLOCK_TILE__3_:%.+]], [[BLOCK_IN__3_:%.+]] = krnl.block [[LOOP_3_]]#1 4 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) -// CHECK: [[RES_15_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> // CHECK: krnl.iterate([[LOOP_3_]]#0, [[BLOCK_TILE__3_]]) with ([[LOOP_3_]]#0 -> [[I_5_:%.+]] = 0 to 2, [[LOOP_3_]]#1 -> [[I_6_:%.+]] = 0 to 64){ -// CHECK: [[VAR_8_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_3_]]#0, [[BLOCK_TILE__3_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[VAR_8_1_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_3_]]#0, [[BLOCK_TILE__3_]]) : (!krnl.loop, !krnl.loop) -> (index, index) +// CHECK-DAG: [[RES_15_:%.+]] = memref.alloc() {{.*}}: memref<4x4xf32> +// CHECK-NOT: separator of consecutive DAGs // CHECK-DAG: [[VAR_9_3_:%.+]] = affine.apply [[MAP_0_]]([[VAR_8_1_]]#1) // CHECK-DAG: [[VAR_10_2_:%.+]] = affine.apply [[MAP_1_]]([[VAR_8_1_]]#1) // CHECK-DAG: [[VAR_11_2_:%.+]] = affine.apply [[MAP_2_]]([[VAR_8_1_]]#1) @@ -192,10 +194,10 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: vector.store [[VAR_cst_1_]], [[RES_15_]]{{.}}[[CST_2_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK: vector.store [[VAR_cst_1_]], [[RES_15_]]{{.}}[[CST_3_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK: scf.for [[I_7_:%.+]] = [[CST_0_]] to [[CST_90_]] step [[CST_4_]] { -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_8_:%.+]] = vector.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1, [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_1_:%.+]] = vector.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_9_3_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_2_:%.+]] = 
vector.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_10_2_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_3_:%.+]] = vector.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_11_2_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_8_:%.+]] = vector.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1, [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_1_:%.+]] = vector.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_9_3_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_2_:%.+]] = vector.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_10_2_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_3_:%.+]] = vector.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_11_2_]], [[I_7_]]{{.}} : memref<2x64x93xf32>, vector<4xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_12_:%.+]] = vector.load [[RES_15_]]{{.}}[[CST_0_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_1_:%.+]] = vector.load [[RES_15_]]{{.}}[[CST_1_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_2_:%.+]] = vector.load [[RES_15_]]{{.}}[[CST_2_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> @@ -210,10 +212,10 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: vector.store [[VAR_48_1_]], [[RES_15_]]{{.}}[[CST_2_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK: vector.store [[VAR_49_1_]], [[RES_15_]]{{.}}[[CST_3_]], [[CST_0_]]{{.}} : memref<4x4xf32>, vector<4xf32> // CHECK: } -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_:%.+]] = memref.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1, [[CST_92_]]{{.}} : memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_:%.+]] = memref.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_9_3_]], [[CST_92_]]{{.}} : 
memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_6_:%.+]] = memref.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_10_2_]], [[CST_92_]]{{.}} : memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_7_:%.+]] = memref.load [[VAR_reshape_22_]]{{.}}[[VAR_8_1_]]#0, [[VAR_11_2_]], [[CST_92_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_:%.+]] = memref.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1, [[CST_92_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_:%.+]] = memref.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_9_3_]], [[CST_92_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_6_:%.+]] = memref.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_10_2_]], [[CST_92_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_7_:%.+]] = memref.load [[VAR_reshape_21_]]{{.}}[[VAR_8_1_]]#0, [[VAR_11_2_]], [[CST_92_]]{{.}} : memref<2x64x93xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_4_:%.+]] = memref.load [[RES_15_]]{{.}}[[CST_0_]], [[CST_0_]]{{.}} : memref<4x4xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_5_:%.+]] = memref.load [[RES_15_]]{{.}}[[CST_1_]], [[CST_0_]]{{.}} : memref<4x4xf32> // CHECK-DAG: [[LOAD_RES_3_MEM_6_:%.+]] = memref.load [[RES_15_]]{{.}}[[CST_2_]], [[CST_0_]]{{.}} : memref<4x4xf32> @@ -243,27 +245,27 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK-DAG: [[VAR_35_1_:%.+]] = vector.shuffle [[VAR_30_1_]], [[VAR_33_1_]] [2, 3, 6, 7] : vector<4xf32>, vector<4xf32> // CHECK: [[VAR_36_1_:%.+]] = arith.addf [[VAR_35_1_]], [[VAR_34_1_]] : vector<4xf32> // CHECK: [[VAR_37_1_:%.+]] = arith.divf [[VAR_36_1_]], [[VAR_cst_0_]] : vector<4xf32> -// CHECK: vector.store [[VAR_37_1_]], [[VAR_reshape_24_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1] : memref<2x64xf32>, vector<4xf32> +// CHECK: vector.store [[VAR_37_1_]], [[VAR_reshape_23_]]{{.}}[[VAR_8_1_]]#0, [[VAR_8_1_]]#1] : memref<2x64xf32>, vector<4xf32> // CHECK: } // CHECK-DAG: [[RES_16_:%.+]] = 
memref.alloc() {{.*}}: memref<2x64x1x1xf32> // CHECK-DAG: [[RES_17_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_17_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_30_:%.+]] = memref.reshape [[RES_12_]]([[RES_17_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_28_:%.+]] = memref.reshape [[RES_12_]]([[RES_17_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_18_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_18_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_32_:%.+]] = memref.reshape [[RES_4_]]([[RES_18_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_30_:%.+]] = memref.reshape [[RES_4_]]([[RES_18_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_19_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_19_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_34_:%.+]] = memref.reshape [[RES_16_]]([[RES_19_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK: [[VAR_reshape_32_:%.+]] = memref.reshape [[RES_16_]]([[RES_19_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_4_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__4_:%.+]], [[BLOCK_IN__4_:%.+]] = krnl.block [[LOOP_4_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__4_]]) with ([[LOOP_4_]] -> [[I_8_:%.+]] = 0 to 128){ // CHECK: [[VAR_9_4_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__4_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[VAR_10_2_:%.+]] = vector.load [[VAR_reshape_30_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> -// CHECK-DAG: [[VAR_11_2_:%.+]] = vector.load [[VAR_reshape_32_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_10_2_:%.+]] = 
vector.load [[VAR_reshape_28_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK-DAG: [[VAR_11_2_:%.+]] = vector.load [[VAR_reshape_30_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_4_1_1_:%.+]] = arith.subf [[VAR_10_2_]], [[VAR_11_2_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_1_1_]], [[VAR_reshape_34_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_4_1_1_]], [[VAR_reshape_32_]]{{.}}[[VAR_9_4_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_20_:%.+]] = memref.alloc() {{.*}}: memref<2x64x31x3xf32> @@ -271,17 +273,17 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: affine.store [[CST_2_]], [[RES_21_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_21_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_21_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_37_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_21_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_35_:%.+]] = memref.reshape [[PARAM_0_]]([[RES_21_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[RES_22_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_22_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_22_]][1] : memref<3xindex> // CHECK: affine.store [[CST_1_]], [[RES_22_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_39_:%.+]] = memref.reshape [[RES_]]([[RES_]]_38) : (memref<2x64x1x1xf32>, memref<3xindex>) -> memref<2x64x1xf32> +// CHECK-DAG: [[VAR_reshape_37_:%.+]] = memref.reshape [[RES_]]([[RES_]]_36) : (memref<2x64x1x1xf32>, memref<3xindex>) -> memref<2x64x1xf32> // CHECK-DAG: [[RES_23_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_23_]][0] : memref<3xindex> // CHECK: 
affine.store [[CST_64_]], [[RES_23_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_23_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_41_:%.+]] = memref.reshape [[RES_20_]]([[RES_23_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_39_:%.+]] = memref.reshape [[RES_20_]]([[RES_23_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[LOOP_5_:%.+]]:2 = krnl.define_loops 2 // CHECK: krnl.iterate([[LOOP_5_]]#0, [[LOOP_5_]]#1) with ([[LOOP_5_]]#0 -> [[I_9_:%.+]] = 0 to 2, [[LOOP_5_]]#1 -> [[I_10_:%.+]] = 0 to 64){ // CHECK-DAG: [[VAR_8_2_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_5_]]#0, [[LOOP_5_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) @@ -289,72 +291,72 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: [[BLOCK_TILE__5_:%.+]], [[BLOCK_IN__5_:%.+]] = krnl.block [[LOOP_6_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__5_]]) with ([[LOOP_6_]] -> [[I_11_:%.+]] = 0 to 62){ // CHECK: [[VAR_11_3_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__5_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_:%.+]] = vector.load [[VAR_reshape_37_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_3_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_:%.+]] = krnl.load [[VAR_reshape_39_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_:%.+]] = vector.load [[VAR_reshape_35_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_3_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_:%.+]] = krnl.load [[VAR_reshape_37_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_:%.+]] = vector.splat [[LOAD_VAR_reshape_MEM_5_1_]] : vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_7_1_:%.+]] = 
arith.subf [[LOAD_VAR_reshape_MEM_4_1_1_]], [[LOAD_VAR_reshape_MEM_6_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_7_1_]], [[VAR_reshape_41_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_3_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_7_1_]], [[VAR_reshape_39_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_3_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> // CHECK: } // CHECK: [[LOOP_7_:%.+]] = krnl.define_loops 1 // CHECK: krnl.iterate([[LOOP_7_]]) with ([[LOOP_7_]] -> [[I_12_:%.+]] = 64 to 93){ // CHECK: [[VAR_11_4_:%.+]] = krnl.get_induction_var_value([[LOOP_7_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_:%.+]] = krnl.load [[VAR_reshape_37_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_4_]]{{.}} : memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_:%.+]] = krnl.load [[VAR_reshape_39_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_:%.+]] = krnl.load [[VAR_reshape_35_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_4_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_:%.+]] = krnl.load [[VAR_reshape_37_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_:%.+]] = arith.subf [[LOAD_VAR_reshape_MEM_4_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_]] : f32 -// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_]], [[VAR_reshape_41_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_4_]]{{.}} : memref<2x64x93xf32> +// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_]], [[VAR_reshape_39_]]{{.}}[[VAR_8_2_]]#0, [[VAR_8_2_]]#1, [[VAR_11_4_]]{{.}} : memref<2x64x93xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_24_:%.+]] = memref.alloc() {{.*}}: memref<2x64x1x1xf32> // CHECK-DAG: [[RES_25_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_25_]][0] : memref<1xindex> -// CHECK-DAG: 
[[VAR_reshape_44_:%.+]] = memref.reshape [[RES_16_]]([[RES_25_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_42_:%.+]] = memref.reshape [[RES_16_]]([[RES_25_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_26_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_26_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_46_:%.+]] = memref.reshape [[RES_24_]]([[RES_26_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK: [[VAR_reshape_44_:%.+]] = memref.reshape [[RES_24_]]([[RES_26_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_8_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__6_:%.+]], [[BLOCK_IN__6_:%.+]] = krnl.block [[LOOP_8_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__6_]]) with ([[LOOP_8_]] -> [[I_13_:%.+]] = 0 to 128){ // CHECK: [[VAR_9_5_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__6_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOOP_7_:%.+]] = vector.load [[VAR_reshape_44_]]{{.}}[[VAR_9_5_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK-DAG: [[LOOP_7_:%.+]] = vector.load [[VAR_reshape_42_]]{{.}}[[VAR_9_5_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK-DAG: [[VAR_11_4_:%.+]] = krnl.load [[VAR_0_]]{{.}}[[CST_0_]]{{.}} : memref<1xf32> // CHECK: [[LOAD_VAR_reshape_MEM_4_1_1_1_:%.+]] = vector.splat [[VAR_11_4_]] : vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_5_1_1_:%.+]] = arith.addf [[LOOP_7_]], [[LOAD_VAR_reshape_MEM_4_1_1_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_5_1_1_]], [[VAR_reshape_46_]]{{.}}[[VAR_9_5_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_5_1_1_]], [[VAR_reshape_44_]]{{.}}[[VAR_9_5_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_27_:%.+]] = memref.alloc() {{.*}}: 
memref<2x64x1x1xf32> // CHECK-DAG: [[RES_28_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_28_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_49_:%.+]] = memref.reshape [[RES_24_]]([[RES_28_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_47_:%.+]] = memref.reshape [[RES_24_]]([[RES_28_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_29_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_29_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_51_:%.+]] = memref.reshape [[RES_27_]]([[RES_29_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK: [[VAR_reshape_49_:%.+]] = memref.reshape [[RES_27_]]([[RES_29_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_9_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__7_:%.+]], [[BLOCK_IN__7_:%.+]] = krnl.block [[LOOP_9_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__7_]]) with ([[LOOP_9_]] -> [[I_14_:%.+]] = 0 to 128){ // CHECK: [[VAR_9_6_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__7_]]) : (!krnl.loop) -> index -// CHECK: [[LOOP_7_1_:%.+]] = vector.load [[VAR_reshape_49_]]{{.}}[[VAR_9_6_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: [[LOOP_7_1_:%.+]] = vector.load [[VAR_reshape_47_]]{{.}}[[VAR_9_6_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: [[VAR_11_5_:%.+]] = math.sqrt [[LOOP_7_1_]] : vector<32xf32> -// CHECK: vector.store [[VAR_11_5_]], [[VAR_reshape_51_]]{{.}}[[VAR_9_6_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: vector.store [[VAR_11_5_]], [[VAR_reshape_49_]]{{.}}[[VAR_9_6_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_30_:%.+]] = memref.alloc() {{.*}}: memref<2x64x1x1xf32> // CHECK-DAG: [[RES_31_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store 
[[CST_128_]], [[RES_31_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_54_:%.+]] = memref.reshape [[RES_27_]]([[RES_31_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK-DAG: [[VAR_reshape_52_:%.+]] = memref.reshape [[RES_27_]]([[RES_31_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK-DAG: [[RES_32_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_128_]], [[RES_32_]][0] : memref<1xindex> -// CHECK: [[VAR_reshape_56_:%.+]] = memref.reshape [[RES_30_]]([[RES_32_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> +// CHECK: [[VAR_reshape_54_:%.+]] = memref.reshape [[RES_30_]]([[RES_32_]]) : (memref<2x64x1x1xf32>, memref<1xindex>) -> memref<128xf32> // CHECK: krnl.iterate() with (){ // CHECK: [[LOOP_10_:%.+]] = krnl.define_loops 1 // CHECK: [[BLOCK_TILE__8_:%.+]], [[BLOCK_IN__8_:%.+]] = krnl.block [[LOOP_10_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__8_]]) with ([[LOOP_10_]] -> [[I_15_:%.+]] = 0 to 128){ // CHECK: [[VAR_9_7_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__8_]]) : (!krnl.loop) -> index -// CHECK: [[LOOP_7_1_:%.+]] = vector.load [[VAR_reshape_54_]]{{.}}[[VAR_9_7_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: [[LOOP_7_1_:%.+]] = vector.load [[VAR_reshape_52_]]{{.}}[[VAR_9_7_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: [[VAR_11_6_:%.+]] = arith.divf [[VAR_cst_]], [[LOOP_7_1_]] : vector<32xf32> -// CHECK: vector.store [[VAR_11_6_]], [[VAR_reshape_56_]]{{.}}[[VAR_9_7_]]{{.}} : memref<128xf32>, vector<32xf32> +// CHECK: vector.store [[VAR_11_6_]], [[VAR_reshape_54_]]{{.}}[[VAR_9_7_]]{{.}} : memref<128xf32>, vector<32xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_33_:%.+]] = memref.alloc() {{.*}}: memref<2x64x31x3xf32> @@ -362,17 +364,17 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: affine.store [[CST_2_]], [[RES_34_]][0] : memref<3xindex> // CHECK: 
affine.store [[CST_64_]], [[RES_34_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_34_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_59_:%.+]] = memref.reshape [[RES_20_]]([[RES_34_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_57_:%.+]] = memref.reshape [[RES_20_]]([[RES_34_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[RES_35_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_35_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_35_]][1] : memref<3xindex> // CHECK: affine.store [[CST_1_]], [[RES_35_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_61_:%.+]] = memref.reshape [[RES_30_]]([[RES_35_]]) : (memref<2x64x1x1xf32>, memref<3xindex>) -> memref<2x64x1xf32> +// CHECK-DAG: [[VAR_reshape_59_:%.+]] = memref.reshape [[RES_30_]]([[RES_35_]]) : (memref<2x64x1x1xf32>, memref<3xindex>) -> memref<2x64x1xf32> // CHECK-DAG: [[RES_36_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_36_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_36_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_36_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_63_:%.+]] = memref.reshape [[RES_33_]]([[RES_36_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_61_:%.+]] = memref.reshape [[RES_33_]]([[RES_36_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[LOOP_11_:%.+]]:2 = krnl.define_loops 2 // CHECK: krnl.iterate([[LOOP_11_]]#0, [[LOOP_11_]]#1) with ([[LOOP_11_]]#0 -> [[I_16_:%.+]] = 0 to 2, [[LOOP_11_]]#1 -> [[I_17_:%.+]] = 0 to 64){ // CHECK-DAG: [[VAR_8_3_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_11_]]#0, [[LOOP_11_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) @@ -380,19 +382,19 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: 
tensor<2x64x31x3xf32>, %a // CHECK: [[BLOCK_TILE__9_:%.+]], [[BLOCK_IN__9_:%.+]] = krnl.block [[LOOP_12_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__9_]]) with ([[LOOP_12_]] -> [[I_18_:%.+]] = 0 to 62){ // CHECK: [[VAR_11_7_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__9_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_59_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_7_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_:%.+]] = krnl.load [[VAR_reshape_61_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_57_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_7_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_:%.+]] = krnl.load [[VAR_reshape_59_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_:%.+]] = vector.splat [[LOAD_VAR_reshape_MEM_5_1_1_]] : vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_7_1_:%.+]] = arith.mulf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_6_1_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_7_1_]], [[VAR_reshape_63_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_7_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_7_1_]], [[VAR_reshape_61_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_7_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> // CHECK: } // CHECK: [[LOOP_13_:%.+]] = krnl.define_loops 1 // CHECK: krnl.iterate([[LOOP_13_]]) with ([[LOOP_13_]] -> [[I_19_:%.+]] = 64 to 93){ // CHECK: [[VAR_11_8_:%.+]] = krnl.get_induction_var_value([[LOOP_13_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_59_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_8_]]{{.}} : 
memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_:%.+]] = krnl.load [[VAR_reshape_61_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_57_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_8_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_:%.+]] = krnl.load [[VAR_reshape_59_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[CST_0_]]{{.}} : memref<2x64x1xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_:%.+]] = arith.mulf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_1_1_]] : f32 -// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_]], [[VAR_reshape_63_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_8_]]{{.}} : memref<2x64x93xf32> +// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_]], [[VAR_reshape_61_]]{{.}}[[VAR_8_3_]]#0, [[VAR_8_3_]]#1, [[VAR_11_8_]]{{.}} : memref<2x64x93xf32> // CHECK: } // CHECK: } // CHECK-DAG: [[RES_37_:%.+]] = memref.alloc() {{.*}}: memref<2x64x31x3xf32> @@ -400,15 +402,15 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: affine.store [[CST_2_]], [[RES_38_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_38_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_38_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_66_:%.+]] = memref.reshape [[RES_33_]]([[RES_38_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_64_:%.+]] = memref.reshape [[RES_33_]]([[RES_38_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[RES_39_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_93_]], [[RES_39_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_68_:%.+]] = memref.reshape [[PARAM_1_]]([[RES_39_]]) : (memref<31x3xf32>, memref<1xindex>) -> memref<93xf32> +// CHECK-DAG: [[VAR_reshape_66_:%.+]] = memref.reshape 
[[PARAM_1_]]([[RES_39_]]) : (memref<31x3xf32>, memref<1xindex>) -> memref<93xf32> // CHECK-DAG: [[RES_40_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_40_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_40_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_40_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_70_:%.+]] = memref.reshape [[RES_37_]]([[RES_40_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_68_:%.+]] = memref.reshape [[RES_37_]]([[RES_40_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[LOOP_14_:%.+]]:2 = krnl.define_loops 2 // CHECK: krnl.iterate([[LOOP_14_]]#0, [[LOOP_14_]]#1) with ([[LOOP_14_]]#0 -> [[I_20_:%.+]] = 0 to 2, [[LOOP_14_]]#1 -> [[I_21_:%.+]] = 0 to 64){ // CHECK-DAG: [[VAR_8_4_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_14_]]#0, [[LOOP_14_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) @@ -416,18 +418,18 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: [[BLOCK_TILE__10_:%.+]], [[BLOCK_IN__10_:%.+]] = krnl.block [[LOOP_15_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__10_]]) with ([[LOOP_15_]] -> [[I_22_:%.+]] = 0 to 62){ // CHECK: [[VAR_11_9_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__10_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_66_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_9_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_:%.+]] = vector.load [[VAR_reshape_68_]]{{.}}[[VAR_11_9_]]{{.}} : memref<93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_64_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_9_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_:%.+]] = 
vector.load [[VAR_reshape_66_]]{{.}}[[VAR_11_9_]]{{.}} : memref<93xf32>, vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_1_:%.+]] = arith.mulf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_1_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_6_1_1_1_]], [[VAR_reshape_70_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_9_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_6_1_1_1_]], [[VAR_reshape_68_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_9_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> // CHECK: } // CHECK: [[LOOP_16_:%.+]] = krnl.define_loops 1 // CHECK: krnl.iterate([[LOOP_16_]]) with ([[LOOP_16_]] -> [[I_23_:%.+]] = 64 to 93){ // CHECK: [[VAR_11_10_:%.+]] = krnl.get_induction_var_value([[LOOP_16_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_66_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_10_]]{{.}} : memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_68_]]{{.}}[[VAR_11_10_]]{{.}} : memref<93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_64_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_10_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_66_]]{{.}}[[VAR_11_10_]]{{.}} : memref<93xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_1_:%.+]] = arith.mulf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_1_1_1_]] : f32 -// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_1_]], [[VAR_reshape_70_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_10_]]{{.}} : memref<2x64x93xf32> +// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_1_]], [[VAR_reshape_68_]]{{.}}[[VAR_8_4_]]#0, [[VAR_8_4_]]#1, [[VAR_11_10_]]{{.}} : memref<2x64x93xf32> // CHECK: } // CHECK: } // CHECK: [[RES_41_:%.+]] = memref.alloc() {{.*}}: memref<2x64x31x3xf32> @@ -436,15 +438,15 @@ func.func 
@layernorm_4D_with_scale_bias_no_SIMD(%arg0: tensor<2x64x31x3xf32>, %a // CHECK: affine.store [[CST_2_]], [[RES_42_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_42_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_42_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_73_:%.+]] = memref.reshape [[RES_37_]]([[RES_42_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_71_:%.+]] = memref.reshape [[RES_37_]]([[RES_42_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[RES_43_:%.+]] = memref.alloc() {{.*}}: memref<1xindex> // CHECK: affine.store [[CST_93_]], [[RES_43_]][0] : memref<1xindex> -// CHECK-DAG: [[VAR_reshape_75_:%.+]] = memref.reshape [[PARAM_2_]]([[RES_43_]]) : (memref<31x3xf32>, memref<1xindex>) -> memref<93xf32> +// CHECK-DAG: [[VAR_reshape_73_:%.+]] = memref.reshape [[PARAM_2_]]([[RES_43_]]) : (memref<31x3xf32>, memref<1xindex>) -> memref<93xf32> // CHECK-DAG: [[RES_44_:%.+]] = memref.alloc() {{.*}}: memref<3xindex> // CHECK: affine.store [[CST_2_]], [[RES_44_]][0] : memref<3xindex> // CHECK: affine.store [[CST_64_]], [[RES_44_]][1] : memref<3xindex> // CHECK: affine.store [[CST_93_]], [[RES_44_]][2] : memref<3xindex> -// CHECK-DAG: [[VAR_reshape_77_:%.+]] = memref.reshape [[RES_41_]]([[RES_44_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> +// CHECK-DAG: [[VAR_reshape_75_:%.+]] = memref.reshape [[RES_41_]]([[RES_44_]]) : (memref<2x64x31x3xf32>, memref<3xindex>) -> memref<2x64x93xf32> // CHECK-DAG: [[LOOP_17_:%.+]]:2 = krnl.define_loops 2 // CHECK: krnl.iterate([[LOOP_17_]]#0, [[LOOP_17_]]#1) with ([[LOOP_17_]]#0 -> [[I_24_:%.+]] = 0 to 2, [[LOOP_17_]]#1 -> [[I_25_:%.+]] = 0 to 64){ // CHECK-DAG: [[VAR_8_5_:%.+]]:2 = krnl.get_induction_var_value([[LOOP_17_]]#0, [[LOOP_17_]]#1) : (!krnl.loop, !krnl.loop) -> (index, index) @@ -452,18 +454,18 @@ func.func @layernorm_4D_with_scale_bias_no_SIMD(%arg0: 
tensor<2x64x31x3xf32>, %a // CHECK: [[BLOCK_TILE__11_:%.+]], [[BLOCK_IN__11_:%.+]] = krnl.block [[LOOP_18_]] 32 : (!krnl.loop) -> (!krnl.loop, !krnl.loop) // CHECK: krnl.iterate([[BLOCK_TILE__11_]]) with ([[LOOP_18_]] -> [[I_26_:%.+]] = 0 to 62){ // CHECK: [[VAR_11_11_:%.+]] = krnl.get_induction_var_value([[BLOCK_TILE__11_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_73_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_11_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_75_]]{{.}}[[VAR_11_11_]]{{.}} : memref<93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_71_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_11_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_:%.+]] = vector.load [[VAR_reshape_73_]]{{.}}[[VAR_11_11_]]{{.}} : memref<93xf32>, vector<32xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_1_1_:%.+]] = arith.addf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_1_1_1_]] : vector<32xf32> -// CHECK: vector.store [[LOAD_VAR_reshape_MEM_6_1_1_1_1_]], [[VAR_reshape_77_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_11_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> +// CHECK: vector.store [[LOAD_VAR_reshape_MEM_6_1_1_1_1_]], [[VAR_reshape_75_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_11_]]{{.}} : memref<2x64x93xf32>, vector<32xf32> // CHECK: } // CHECK: [[LOOP_19_:%.+]] = krnl.define_loops 1 // CHECK: krnl.iterate([[LOOP_19_]]) with ([[LOOP_19_]] -> [[I_27_:%.+]] = 64 to 93){ // CHECK: [[VAR_11_12_:%.+]] = krnl.get_induction_var_value([[LOOP_19_]]) : (!krnl.loop) -> index -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_73_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_12_]]{{.}} : memref<2x64x93xf32> -// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_1_:%.+]] = 
krnl.load [[VAR_reshape_75_]]{{.}}[[VAR_11_12_]]{{.}} : memref<93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_71_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_12_]]{{.}} : memref<2x64x93xf32> +// CHECK-DAG: [[LOAD_VAR_reshape_MEM_5_1_1_1_1_1_:%.+]] = krnl.load [[VAR_reshape_73_]]{{.}}[[VAR_11_12_]]{{.}} : memref<93xf32> // CHECK: [[LOAD_VAR_reshape_MEM_6_1_1_1_1_:%.+]] = arith.addf [[LOAD_VAR_reshape_MEM_4_1_1_1_1_1_1_]], [[LOAD_VAR_reshape_MEM_5_1_1_1_1_1_]] : f32 -// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_1_1_]], [[VAR_reshape_77_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_12_]]{{.}} : memref<2x64x93xf32> +// CHECK: krnl.store [[LOAD_VAR_reshape_MEM_6_1_1_1_1_]], [[VAR_reshape_75_]]{{.}}[[VAR_8_5_]]#0, [[VAR_8_5_]]#1, [[VAR_11_12_]]{{.}} : memref<2x64x93xf32> // CHECK: } // CHECK: } // CHECK: onnx.Return [[VAR_6_]] : tensor<2x64x31x3xf32>