From b3d52307270dd7c39704c233c733d46ccc808447 Mon Sep 17 00:00:00 2001 From: Djordje Ramic Date: Tue, 12 Nov 2024 15:14:13 +0000 Subject: [PATCH] Corrected tests --- .../Rock/Tuning/QuickTuningPerfconfigs.inc | 97 ++++++++++--------- .../Dialect/Rock/affix_tuning_params.mlir | 70 ++++++------- .../Dialect/Rock/test_packed_arithmetic.mlir | 11 +-- .../fusion/rock-gemm-reduce-align-tiling.mlir | 74 +++++++------- .../fusion/tosa-to-rock-gemm-reshape-add.mlir | 10 +- mlir/test/fusion/tosa-to-rock-tp-add-tp.mlir | 2 +- mlir/test/fusion/tosa-to-rock-tp-add.mlir | 2 +- 7 files changed, 136 insertions(+), 130 deletions(-) diff --git a/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc index d8ea8f9815d4..110184127e63 100644 --- a/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc +++ b/mlir/include/mlir/Dialect/Rock/Tuning/QuickTuningPerfconfigs.inc @@ -24,26 +24,30 @@ const InitParamsNonAccel PopulateParams::initParametersGemm[PopulateParams::nIni // BEGIN_CONV_NonAccel_f32_DEFS const InitParamsNonAccel PopulateParams::initParametersConv[PopulateParams::nInitParametersConv] = { - {128,128,128,8,4,2,1}, + {128,128,32,16,2,4,1}, + {64,64,128,16,2,2,1}, + {64,128,32,8,2,2,1}, + {128,32,32,16,2,2,1}, + {64,64,128,16,4,4,1}, + {64,32,64,4,2,2,1}, + {64,128,64,16,2,4,1}, + {64,32,64,8,2,2,1}, + {128,32,64,8,2,2,1}, + {256,32,32,16,2,2,1}, {64,32,32,16,4,4,1}, - {128,64,32,16,4,4,1}, + {64,64,128,4,4,4,1}, + {128,128,128,4,2,4,1}, + {128,64,32,16,2,2,1}, + {64,128,64,16,4,2,1}, + {256,128,128,8,2,4,1}, {64,32,32,8,2,4,1}, - {64,64,32,16,4,2,1}, - {64,128,64,16,2,4,1}, - {128,128,128,16,2,2,1}, - {64,64,64,8,2,2,1}, - {128,32,32,16,2,2,1}, - {64,128,64,16,2,2,1}, - {128,64,128,4,2,2,1}, - {128,32,128,4,2,4,1}, - {128,32,64,8,2,4,1}, + {128,64,32,4,2,2,1}, + {128,128,64,4,2,2,1}, + {128,128,128,16,2,4,1}, + {64,128,32,16,2,4,1}, {64,64,32,8,2,2,1}, - {128,32,64,4,2,2,1}, - {256,128,128,8,4,2,1}, - {64,128,32,16,2,2,1}, - {64,128,32,8,2,2,1}, - {256,32,32,16,2,2,1}, - {128,128,128,4,2,4,1} + {64,64,64,16,2,4,1}, + {64,32,128,8,2,4,1} }; // END_CONV_NonAccel_f32_DEFS @@ -57,7 +61,7 @@ static const InitParamsNonAccel initParametersGemm[nInitParametersGemm]; // END_GEMM_NonAccel_f32_DECS // BEGIN_CONV_NonAccel_f32_DECS -static constexpr size_t nInitParametersConv = 20; +static constexpr size_t nInitParametersConv = 24; static const InitParamsNonAccel initParametersConv[nInitParametersConv]; // END_CONV_NonAccel_f32_DECS @@ -255,28 +259,32 @@ const InitParamsAccel PopulateParamsWmma::initParametersFp16Gemm[PopulateParamsW // BEGIN_CONV_Wmma_f16_DEFS const InitParamsAccel PopulateParamsWmma::initParametersFp16Conv[PopulateParamsWmma::nInitParametersFp16Conv] = { - {128,64,4,64,64,8,1,true,true}, - {64,256,2,64,64,8,1,true,true}, + {16,16,4,16,16,8,1,true,true}, + {256,128,8,128,32,8,1,true,true}, + {256,64,2,64,64,8,1,true,true}, + {64,64,4,32,32,8,1,true,true}, {128,128,2,32,32,8,1,true,true}, + {64,16,8,16,16,16,1,true,true}, {128,64,8,32,64,8,1,true,true}, - {128,32,2,32,32,8,1,true,true}, - {64,16,8,16,16,8,1,true,true}, - {128,128,4,32,64,8,1,true,true}, - {256,64,8,64,32,8,1,true,true}, - {256,256,8,32,64,8,1,true,true}, - {128,64,2,64,64,8,1,true,true}, - {16,32,4,16,16,16,1,true,true}, - {64,32,8,64,16,8,1,true,true}, - {128,256,4,128,32,8,1,true,true}, - {128,16,8,32,16,8,1,true,true}, - {16,16,4,16,16,8,1,true,true}, + {256,256,8,64,32,8,1,true,true}, + {64,128,8,64,32,8,1,true,true}, + {128,64,2,64,32,8,1,true,true}, {128,256,2,64,32,8,1,true,true}, {16,16,8,16,16,8,1,true,true}, - {256,64,4,32,64,8,1,true,true}, + {128,32,2,32,32,8,1,true,true}, + {128,256,8,128,32,8,1,true,true}, + {32,128,2,32,32,8,1,true,true}, + {64,256,4,32,64,8,1,true,true}, + {64,32,8,32,32,8,1,true,true}, + {64,256,2,64,64,8,1,true,true}, + {16,32,4,16,16,16,1,true,true}, {16,32,4,16,32,8,1,true,true}, + {64,16,8,16,16,8,1,true,true}, + {256,128,4,32,64,8,1,true,true}, + {128,256,4,64,32,8,1,true,true}, + {128,128,4,64,64,8,1,true,true}, {16,128,8,16,16,8,1,true,true}, - {64,16,8,16,16,16,1,true,true}, - {256,128,8,32,32,8,1,true,true} + {128,16,8,32,16,8,1,true,true} }; // END_CONV_Wmma_f16_DEFS @@ -303,14 +311,15 @@ const InitParamsAccel PopulateParamsWmma::initParametersForward8BitGemm[Populate // BEGIN_CONV_Wmma_i8_DEFS const InitParamsAccel PopulateParamsWmma::initParametersForward8BitConv[PopulateParamsWmma::nInitParametersForward8BitConv] = { {128,64,8,32,64,16,1,true,true}, - {128,128,8,16,128,8,1,true,true}, - {64,256,4,64,32,16,1,true,true}, - {256,32,8,64,32,16,1,true,true}, - {128,16,8,32,16,16,1,true,true}, - {128,128,8,16,32,4,1,true,true}, - {128,64,4,32,32,4,1,true,true}, - {64,256,8,32,64,16,1,true,true}, - {16,128,4,16,16,16,1,true,true}, + {32,64,4,32,32,16,1,true,true}, + {64,64,4,64,16,16,1,true,true}, + {256,64,8,32,64,16,1,true,true}, + {128,32,4,64,16,16,1,true,true}, + {256,32,8,32,16,4,1,true,true}, + {32,256,4,32,16,4,1,true,true}, + {128,128,4,128,16,16,1,true,true}, + {64,16,8,32,16,16,1,true,true}, + {128,128,2,128,32,16,1,true,true}, {256,128,2,32,128,16,1,true,true} }; // END_CONV_Wmma_i8_DEFS @@ -325,7 +334,7 @@ static const InitParamsAccel initParametersFp16Gemm[nInitParametersFp16Gemm]; // END_GEMM_Wmma_f16_DECS // BEGIN_CONV_Wmma_f16_DECS -static constexpr size_t nInitParametersFp16Conv = 22; +static constexpr size_t nInitParametersFp16Conv = 26; static const InitParamsAccel initParametersFp16Conv[nInitParametersFp16Conv]; // END_CONV_Wmma_f16_DECS @@ -335,7 +344,7 @@ static const InitParamsAccel initParametersForward8BitGemm[nInitParametersForwar // END_GEMM_Wmma_i8_DECS // BEGIN_CONV_Wmma_i8_DECS -static constexpr size_t nInitParametersForward8BitConv = 10; +static constexpr size_t nInitParametersForward8BitConv = 11; static const InitParamsAccel initParametersForward8BitConv[nInitParametersForward8BitConv]; // END_CONV_Wmma_i8_DECS diff --git a/mlir/test/Dialect/Rock/affix_tuning_params.mlir b/mlir/test/Dialect/Rock/affix_tuning_params.mlir index 4211056337f7..499785a59a39 100644 --- a/mlir/test/Dialect/Rock/affix_tuning_params.mlir +++ b/mlir/test/Dialect/Rock/affix_tuning_params.mlir @@ -10,9 +10,9 @@ // GRID-LABEL: rock_conv func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) { // CHECK: rock.conv - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 900 + // GRID-SAME: gridSize = 3600 rock.conv(%filter, %input, %output) features = none { arch = "amdgcn-amd-amdhsa:gfx906", filter_layout = ["g", "k", "c", "0", "1"], @@ -29,9 +29,9 @@ func.func @rock_conv(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x // GRID-LABEL: func.func @rock_conv_f16 func.func @rock_conv_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : memref<128x1x128x30x30xf16>) { // CHECK: rock.conv - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 900 + // GRID-SAME: gridSize = 3600 rock.conv(%filter, %input, %output) features = none { arch = "amdgcn-amd-amdhsa:gfx906", filter_layout = ["g", "k", "c", "0", "1"], @@ -49,9 +49,9 @@ func.func @rock_conv_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x func.func @rock_conv_i8(%filter : memref<1x128x8x3x3xi8>, %input : memref<128x1x8x32x32xi8>, %output : memref<128x1x128x30x30xi32>) { // CHECK: rock.conv // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 450 + // GRID-SAME: gridSize = 900 rock.conv(%filter, %input, %output) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", filter_layout = ["g", "k", "c", "0", "1"], @@ -69,9 +69,9 @@ func.func @rock_conv_i8(%filter : memref<1x128x8x3x3xi8>, %input : memref<128x1x func.func @rock_conv_bwd_data(%filter: memref<1x1024x1024x1x1xf32>, %input: memref<128x1x1024x14x14xf32>, %output: memref<128x1x1024x14x14xf32>) attributes {kernel = 0 : i32} { // CHECK: rock.conv_bwd_data // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 392 + // GRID-SAME: gridSize = 6272 rock.conv_bwd_data(%filter, %input, %output) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", dilations = [1 : index, 1 : index], @@ -90,9 +90,9 @@ func.func @rock_conv_bwd_data(%filter: memref<1x1024x1024x1x1xf32>, %input: memr func.func @rock_conv_bwd_data_f16(%filter: memref<1x1024x1024x1x1xf16>, %input: memref<128x1x1024x14x14xf16>, %output: memref<128x1x1024x14x14xf16>) attributes {kernel = 0 : i32} { // CHECK: rock.conv_bwd_data // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 784 + // GRID-SAME: gridSize = 1568 rock.conv_bwd_data(%filter, %input, %output) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", dilations = [1 : index, 1 : index], @@ -130,7 +130,7 @@ func.func @rock_conv_bwd_data_padMN(%filter : memref<1x64x3x1x1xf32>, %input : m // GRID-LABEL: @rock_conv_bwd_data_padMK func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : memref<128x1x3x15x15xf32>, %output : memref<128x1x11x15x15xf32>) { // CHECK: rock.conv_bwd_data - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 450 rock.conv_bwd_data(%filter, %input, %output) features = none { @@ -150,9 +150,9 @@ func.func @rock_conv_bwd_data_padMK(%filter : memref<1x11x3x1x1xf32>, %input : m // GRID-LABEL: @rock_conv_bwd_weight func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memref<128x1x8x32x32xf32>, %output : memref<128x1x128x30x30xf32>) { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 12 + // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32, @@ -170,9 +170,9 @@ func.func @rock_conv_bwd_weight(%filter : memref<1x128x8x3x3xf32>, %input : memr // GRID-LABEL: @rock_conv_bwd_weight_f16 func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input : memref<128x1x8x32x32xf16>, %output : memref<128x1x128x30x30xf16>) { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 12 + // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32, @@ -190,7 +190,7 @@ func.func @rock_conv_bwd_weight_f16(%filter : memref<1x128x8x3x3xf16>, %input : // GRID-LABEL: func.func @rock_conv_bwd_weight_padALL func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input : memref<7x1x8x32x32xf32>, %output : memref<7x1x20x30x30xf32>) { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { @@ -210,7 +210,7 @@ func.func @rock_conv_bwd_weight_padALL(%filter : memref<1x20x8x3x3xf32>, %input // GRID-LABEL: @rock_conv_bwd_weight_padALL_f16 func.func @rock_conv_bwd_weight_padALL_f16(%filter : memref<1x20x8x3x3xf16>, %input : memref<7x1x8x32x32xf16>, %output : memref<7x1x20x30x30xf16>) { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm // GRID-SAME: gridSize = 3 rock.conv_bwd_weight(%filter, %input, %output) features = none { @@ -254,9 +254,9 @@ func.func @rock_conv_7x7_tuning(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256 func.func @rock_conv_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) { // CHECK: rock.conv // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 25088 + // GRID-SAME: gridSize = 12544 rock.conv(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", dilations = [1 : index, 1 : index], @@ -273,10 +273,10 @@ func.func @rock_conv_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x23 // GRID-LABEL: @rock_conv_bwd_weight_7x7 func.func @rock_conv_bwd_weight_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) attributes {kernel = 0 : i32} { // CHECK: rock.conv_bwd_weight - // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: derivedBlockSize = 128 + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 5 + // GRID-SAME: gridSize = 20 rock.conv_bwd_weight(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", dilations = [1 : index, 1 : index], @@ -316,10 +316,10 @@ func.func @rock_conv_bwd_data_7x7_tuning(%arg0: memref<1x64x3x7x7xf32>, %arg1: m // GRID-LABEL: @rock_conv_bwd_data_7x7 func.func @rock_conv_bwd_data_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<256x1x3x230x230xf32>, %arg2: memref<256x1x64x112x112xf32>) attributes {kernel = 1 : i32} { // CHECK: rock.conv_bwd_data - // CHECK-SAME: derivedBlockSize = 128 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: derivedBlockSize = 64 + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 105800 + // GRID-SAME: gridSize = 211600 rock.conv_bwd_data(%arg0, %arg1, %arg2) features = mfma|dot|atomic_add { arch = "amdgcn-amd-amdhsa:gfx908", dilations = [1 : index, 1 : index], @@ -337,9 +337,9 @@ func.func @rock_conv_bwd_data_7x7(%arg0: memref<1x64x3x7x7xf32>, %arg1: memref<2 // GRID-LABEL: @rock_gemm_from_conv func.func @rock_gemm_from_conv(%a : memref<1x72x128xf32>, %b : memref<1x72x115200xf32>, %c : memref<1x128x115200xf32>) { // CHECK: rock.gemm - // CHECK-SAME: params = #rock.general_gemm_params + // CHECK-SAME: params = #rock.general_gemm_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 900 + // GRID-SAME: gridSize = 7200 rock.gemm %c = tr %a * %b features = none storeMethod = set { arch = "amdgcn-amd-amdhsa:gfx906", numCU = 64 : i32 @@ -352,9 +352,9 @@ func.func @rock_gemm_from_conv(%a : memref<1x72x128xf32>, %b : memref<1x72x11520 func.func @rock_gemm_from_i8_conv(%a : memref<1x72x128xi8>, %b : memref<1x72x115200xi8>, %c : memref<1x128x115200xi32>) { // CHECK: rock.gemm // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 450 + // GRID-SAME: gridSize = 3600 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add storeMethod = set { arch = "amdgcn-amd-amdhsa:gfx908", numCU = 120 : i32 @@ -370,9 +370,9 @@ func.func @rock_gemm_from_i8_conv(%a : memref<1x72x128xi8>, %b : memref<1x72x115 func.func @rock_gemm_from_i8_conv_gfx940(%a : memref<1x72x128xi8>, %b : memref<1x72x115200xi8>, %c : memref<1x128x115200xi32>) { // CHECK: rock.gemm // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 3600 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add storeMethod = set { arch = "amdgcn-amd-amdhsa:gfx940", numCU = 120 : i32 @@ -386,9 +386,9 @@ func.func @rock_gemm_from_i8_conv_gfx940(%a : memref<1x72x128xi8>, %b : memref<1 func.func @rock_gemm_xdlops_fp8_bf8(%a : memref<1x72x128xf8E4M3FNUZ>, %b : memref<1x72x115200xf8E5M2FNUZ>, %c : memref<1x128x115200xf32>) { // CHECK: rock.gemm // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 3600 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add storeMethod = set { arch = "amdgcn-amd-amdhsa:gfx940", numCU = 120 : i32 @@ -402,9 +402,9 @@ func.func @rock_gemm_xdlops_fp8_bf8(%a : memref<1x72x128xf8E4M3FNUZ>, %b : memre func.func @rock_gemm_xdlops_fp8_bf8_ocp(%a : memref<1x72x128xf8E4M3FN>, %b : memref<1x72x115200xf8E5M2>, %c : memref<1x128x115200xf32>) { // CHECK: rock.gemm // CHECK-SAME: derivedBlockSize = 256 - // CHECK-SAME: params = #rock.xdlops_gemm_derived_params + // CHECK-SAME: params = #rock.xdlops_gemm_derived_params // GRID: rock.gridwise_gemm - // GRID-SAME: gridSize = 3600 + // GRID-SAME: gridSize = 1800 rock.gemm %c = tr %a * %b features = mfma|dot|atomic_add storeMethod = set { arch = "amdgcn-amd-amdhsa:gfx940", numCU = 120 : i32 diff --git a/mlir/test/Dialect/Rock/test_packed_arithmetic.mlir b/mlir/test/Dialect/Rock/test_packed_arithmetic.mlir index 1e266c9b1629..eac3cfea1aa2 100644 --- a/mlir/test/Dialect/Rock/test_packed_arithmetic.mlir +++ b/mlir/test/Dialect/Rock/test_packed_arithmetic.mlir @@ -18,15 +18,12 @@ // VECTORIZE: vector.transfer_write %[[trunc]] // ROCDL: %[[pkrtz:.*]] = rocdl.cvt.pkrtz {{.*}}, {{.*}} : vector<2xf16> // ROCDL: llvm.store %[[pkrtz]], {{.*}} : vector<2xf16>, !llvm.ptr<5> -// LLVM: %[[extract0:.*]] = extractelement <16 x float> {{.*}}, i64 0 -// LLVM: %[[extract1:.*]] = extractelement <16 x float> {{.*}}, i64 1 +// LLVM: %[[extract0:.*]] = extractelement <4 x float> {{.*}}, i64 0 +// LLVM: %[[extract1:.*]] = extractelement <4 x float> {{.*}}, i64 1 // LLVM: tail call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %[[extract0]], float %[[extract1]]) -// LLVM: %[[extract2:.*]] = extractelement <16 x float> {{.*}}, i64 2 -// LLVM: %[[extract3:.*]] = extractelement <16 x float> {{.*}}, i64 3 +// LLVM: %[[extract2:.*]] = extractelement <4 x float> {{.*}}, i64 2 +// LLVM: %[[extract3:.*]] = extractelement <4 x float> {{.*}}, i64 3 // LLVM: tail call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %[[extract2]], float %[[extract3]]) -// LLVM: %[[extract14:.*]] = extractelement <16 x float> {{.*}}, i64 14 -// LLVM: %[[extract15:.*]] = extractelement <16 x float> {{.*}}, i64 15 -// LLVM: tail call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %[[extract14]], float %[[extract15]]) // ASM: v_pk_add_f16 {{.*}}, {{.*}}, {{.*}} module { func.func @test_fusion(%arg0: memref<1x128x128xf16> {mhal.read_access}, %arg1: memref<1x128x128xf16> {mhal.read_access}, %arg2: memref<1x128x128xf16> {mhal.read_access}, %arg3: memref<1x128x128xf16> {mhal.write_access}) attributes {arch = "gfx942", kernel} { diff --git a/mlir/test/fusion/rock-gemm-reduce-align-tiling.mlir b/mlir/test/fusion/rock-gemm-reduce-align-tiling.mlir index 95bad400cdfc..640e604487cd 100644 --- a/mlir/test/fusion/rock-gemm-reduce-align-tiling.mlir +++ b/mlir/test/fusion/rock-gemm-reduce-align-tiling.mlir @@ -7,16 +7,16 @@ func.func @test_gemm_reduce_last_axis_fusion(%arg0: memref<1x128x64xf32>, %arg1: // CHECK: rock.blockwise_broadcast_reduce sum {{.*}} into %[[BLOCK_RED_OUT:[0-9]+]] // CHECK: %[[TR0:.+]] = rock.transform %arg2 by {{.*}} : memref<1x128x1xf32> to memref<1x128x256xf32> - // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<2x128x2x128xf32> - // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<2x128x2x128xf32> to memref<2x1x2x1x128x128xf32> - // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<2x1x2x1x128x128xf32> to memref<2x1x2x128x1xf32> - // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<2x1x2x128x1xf32> to memref<2x1x2x128x128xf32> + // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<16x2x64x8x32xf32> + // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<16x2x64x8x32xf32> to memref<16x2x8x1x64x32xf32> + // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<16x2x8x1x64x32xf32> to memref<16x2x8x64x1xf32> + // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<16x2x8x64x1xf32> to memref<16x2x8x64x32xf32> - // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x4x4x4x4x2x4x2x4xf32 - // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<2x1x2x4x4x4x4x2x4x2x4xf32> to memref<2x1x2x256x64xf32> + // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x2x2x4x4x4x2x2x2xf32> + // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<16x2x8x2x2x4x4x4x2x2x2xf32> to memref<16x2x8x64x32xf32> // CHECK: rock.threadwise_write_all {{.*}}%[[BLOCK_RED_OUT]] -> [](%[[TR9]]){{.*}} by atomic_add : {{.*}} rock.reduce sum %0 into %arg2 features = mfma|dot|atomic_add {axis = 2 : index, blockSize = 256 : i32, gridSize = 1 : i32} : memref<1x128x256xf32> into memref<1x128x1xf32> @@ -31,16 +31,16 @@ func.func @test_gemm_reduce_middle_axis_fusion(%arg0: memref<1x128x64xf32>, %arg // CHECK: rock.blockwise_broadcast_reduce sum {{.*}} into %[[BLOCK_RED_OUT:[0-9]+]] // CHECK: %[[TR0:.+]] = rock.transform %arg2 by {{.*}} : memref<1x1x256xf32> to memref<1x128x256xf32> - // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<2x128x2x128xf32> - // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<2x128x2x128xf32> to memref<2x1x2x1x128x128xf32> - // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<2x1x2x1x128x128xf32> to memref<2x1x2x1x128xf32> - // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim0"] at [3]>{{.*}} : memref<2x1x2x1x128xf32> to memref<2x1x2x128x128xf32> + // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<16x2x64x8x32xf32> + // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<16x2x64x8x32xf32> to memref<16x2x8x1x64x32xf32> + // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<16x2x8x1x64x32xf32> to memref<16x2x8x1x32xf32> + // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim0"] at [3]>{{.*}} : memref<16x2x8x1x32xf32> to memref<16x2x8x64x32xf32> - // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x4x4x4x4x2x4x2x4xf32> - // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<2x1x2x4x4x4x4x2x4x2x4xf32> to memref<2x1x2x256x64xf32> + // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x2x2x4x4x4x2x2x2xf32> + // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<16x2x8x2x2x4x4x4x2x2x2xf32> to memref<16x2x8x64x32xf32> // CHECK: rock.threadwise_write_all {{.*}}%[[BLOCK_RED_OUT]] -> [](%[[TR9]]){{.*}} by atomic_add : {{.*}} rock.reduce sum %0 into %arg2 features = mfma|dot|atomic_add {axis = 1 : index, blockSize = 256 : i32, gridSize = 1 : i32} : memref<1x128x256xf32> into memref<1x1x256xf32> @@ -61,16 +61,16 @@ func.func @test_gemm_add_reduce_fusion(%arg0: memref<1x128x64xf32>, %arg1: memre // CHECK: rock.blockwise_broadcast_reduce sum {{.*}} into %[[BLOCK_RED_OUT:[0-9]+]] // CHECK: %[[TR0:.+]] = rock.transform %arg3 by {{.*}} : memref<1x128x1xf32> to memref<1x128x256xf32> - // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<2x128x2x128xf32> - // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<2x128x2x128xf32> to memref<2x1x2x1x128x128xf32> - // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<2x1x2x1x128x128xf32> to memref<2x1x2x128x1xf32> - // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<2x1x2x128x1xf32> to memref<2x1x2x128x128xf32> + // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<16x2x64x8x32xf32> + // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<16x2x64x8x32xf32> to memref<16x2x8x1x64x32xf32> + // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<16x2x8x1x64x32xf32> to memref<16x2x8x64x1xf32> + // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<16x2x8x64x1xf32> to memref<16x2x8x64x32xf32> - // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x4x4x4x4x2x4x2x4xf32> - // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<2x1x2x4x4x4x4x2x4x2x4xf32> to memref<2x1x2x256x64xf32> + // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x2x2x4x4x4x2x2x2xf32> + // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<16x2x8x2x2x4x4x4x2x2x2xf32> to memref<16x2x8x64x32xf32> // CHECK: rock.threadwise_write_all {{.*}}%[[BLOCK_RED_OUT]] -> [](%[[TR9]]){{.*}} by atomic_add : {{.*}} rock.reduce sum %1 into %arg3 features = mfma|dot|atomic_add {axis = 2 : index, blockSize = 256 : i32, gridSize = 1 : i32} : memref<1x128x256xf32> into memref<1x128x1xf32> @@ -84,16 +84,16 @@ func.func @test_gemm_reduce_max(%arg0: memref<1x128x64xf32>, %arg1: memref<1x64x // CHECK: rock.blockwise_broadcast_reduce max {{.*}} into %[[BLOCK_RED_OUT:[0-9]+]] // CHECK: %[[TR0:.+]] = rock.transform %arg2 by {{.*}} : memref<1x128x1xf32> to memref<1x128x256xf32> - // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<2x128x2x128xf32> - // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<2x128x2x128xf32> to memref<2x1x2x1x128x128xf32> - // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<2x1x2x1x128x128xf32> to memref<2x1x2x128x1xf32> - // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<2x1x2x128x1xf32> to memref<2x1x2x128x128xf32> - - // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x128x128xf32> - // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<2x1x2x128x128xf32> to memref<2x1x2x4x4x4x4x2x4x2x4xf32> - // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<2x1x2x4x4x4x4x2x4x2x4xf32> to memref<2x1x2x256x64xf32> + // CHECK: %[[TR1:.+]] = rock.transform %[[TR0]] by {{.*}} : memref<1x128x256xf32> to memref<16x2x64x8x32xf32> + // CHECK: %[[TR2:.+]] = rock.transform %[[TR1]] by {{.*}} : memref<16x2x64x8x32xf32> to memref<16x2x8x1x64x32xf32> + // CHECK: %[[TR3:.+]] = rock.transform %[[TR2]] by {{.*}} : memref<16x2x8x1x64x32xf32> to memref<16x2x8x64x1xf32> + // CHECK: %[[TR4:.+]] = rock.transform %[[TR3]] by {{.*}} ["dim1"] at [4]>{{.*}} : memref<16x2x8x64x1xf32> to memref<16x2x8x64x32xf32> + + // CHECK: %[[TR5:.+]] = rock.transform %[[TR4]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR6:.+]] = rock.transform %[[TR5]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR7:.+]] = rock.transform %[[TR6]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x64x32xf32> + // CHECK: %[[TR8:.+]] = rock.transform %[[TR7]] by {{.*}} : memref<16x2x8x64x32xf32> to memref<16x2x8x2x2x4x4x4x2x2x2xf32> + // CHECK: %[[TR9:.+]] = rock.transform %[[TR8]] by {{.*}} : memref<16x2x8x2x2x4x4x4x2x2x2xf32> to memref<16x2x8x64x32xf32> // CHECK: rock.threadwise_write_all {{.*}}%[[BLOCK_RED_OUT]] -> [](%[[TR9]]){{.*}} by atomic_max : {{.*}} rock.reduce max %0 into %arg2 features = mfma|dot|atomic_add {axis = 2 : index, blockSize = 256 : i32, gridSize = 1 : i32} : memref<1x128x256xf32> into memref<1x128x1xf32> diff --git a/mlir/test/fusion/tosa-to-rock-gemm-reshape-add.mlir b/mlir/test/fusion/tosa-to-rock-gemm-reshape-add.mlir index 5d402f1c2c76..5def84e9c2f9 100644 --- a/mlir/test/fusion/tosa-to-rock-gemm-reshape-add.mlir +++ b/mlir/test/fusion/tosa-to-rock-gemm-reshape-add.mlir @@ -1,13 +1,13 @@ // RUN: rocmlir-driver --host-pipeline highlevel %s | rocmlir-opt --rock-affix-params --rock-conv-to-gemm --rock-gemm-to-gridwise -rock-regularize -rock-gridwise-gemm-to-blockwise -rock-linalg-align | FileCheck %s --check-prefix=CHECK_LINALG_ALIGN -// CHECK_LINALG_ALIGN-DAG: #[[AMAP:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1, d2)> -// CHECK_LINALG_ALIGN-DAG: #[[AMAP1:.*]] = affine_map<(d0, d1) -> (d0 * 1000 + d1)> -// CHECK_LINALG_ALIGN-DAG: #[[MAP1:.*]] = #rock.transform_map<#[[AMAP]] by [ ["dim0"] at [0]>, ["dim1"] at [1]>] bounds = [1, 1, 1000] -> [1, 1000]> -// CHECK_LINALG_ALIGN-DAG: #[[MAP2:.*]] = #rock.transform_map<#[[AMAP1]] by [ ["dim0"] at [0]>] bounds = [1, 1000] -> [1000]> +// CHECK_LINALG_ALIGN-DAG: #[[AMAP:.*]] = affine_map<(d0, d1) -> (d0 * 1000 + d1)> +// CHECK_LINALG_ALIGN-DAG: #[[AMAP1:.*]] = affine_map<(d0, d1, d2) -> (d0 + d1, d2)> +// CHECK_LINALG_ALIGN-DAG: #[[MAP1:.*]] = #rock.transform_map<#[[AMAP]] by [ ["dim0"] at [0]>] bounds = [1, 1000] -> [1000]> +// CHECK_LINALG_ALIGN-DAG: #[[MAP2:.*]] = #rock.transform_map<#[[AMAP1]] by [ ["dim0"] at [0]>, ["dim1"] at [1]>] bounds = [1, 1, 1000] -> [1, 1000]> // CHECK_LINALG_ALIGN-COUNT-2: rock.threadwise_read_into {{.*}} // CHECK_LINALG_ALIGN: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] : -// CHECK_LINALG_ALIGN: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<16xf32, #gpu.address_space>) +// CHECK_LINALG_ALIGN: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<64xf32, #gpu.address_space>) // CHECK_LINALG_ALIGN: rock.threadwise_write_all {{.*}} %[[outBuf]] -> // to test reshape is converted as transform and fused. diff --git a/mlir/test/fusion/tosa-to-rock-tp-add-tp.mlir b/mlir/test/fusion/tosa-to-rock-tp-add-tp.mlir index 5fe73d78d175..7b1c499461f0 100644 --- a/mlir/test/fusion/tosa-to-rock-tp-add-tp.mlir +++ b/mlir/test/fusion/tosa-to-rock-tp-add-tp.mlir @@ -3,7 +3,7 @@ // CHECK-DAG: #[[MAP2:.*]] = #rock.transform_map<{{.*}} by [ ["dim0", "dim2", "dim3", "dim1"] at [0, 2, 3, 1]>] bounds = [256, 28, 28, 64] -> [256, 64, 28, 28]> // CHECK-COUNT-2: rock.threadwise_read_into {{.*}} // CHECK: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] : -// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<16xf32, #gpu.address_space>) +// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<128xf32, #gpu.address_space>) // CHECK: rock.threadwise_write_all {{.*}} %[[outBuf]] -> // to test transpose is converted as transform and fused. diff --git a/mlir/test/fusion/tosa-to-rock-tp-add.mlir b/mlir/test/fusion/tosa-to-rock-tp-add.mlir index b8b1f5109cad..fe5897d469e9 100644 --- a/mlir/test/fusion/tosa-to-rock-tp-add.mlir +++ b/mlir/test/fusion/tosa-to-rock-tp-add.mlir @@ -3,7 +3,7 @@ // CHECK-DAG: #[[MAP2:.*]] = #rock.transform_map<#map{{.*}} by [ ["{{.*}}", "{{.*}}", "{{.*}}", "{{.*}}"] at [0, 2, 3, 1]>] bounds = [256, 28, 28, 64] -> [256, 64, 28, 28]> // CHECK-COUNT-2: rock.threadwise_read_into {{.*}} // CHECK: rock.threadwise_read_into {{.*}} -> [[lain:%.*]] : -// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<16xf32, #gpu.address_space>) +// CHECK: linalg.generic{{.*}} ins({{.*}}, [[lain]] :{{.*}}) outs(%[[outBuf:.*]] : memref<128xf32, #gpu.address_space>) // CHECK: rock.threadwise_write_all {{.*}} %[[outBuf]] -> // to test transpose is converted as transform and fused.