diff --git a/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp b/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
index 20c2e5ff1990..d3ceb7ea4271 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
@@ -41,7 +41,11 @@ struct SetIGEMMConfiguration final : OpRewritePattern<linalg::GenericOp> {
     auto im2colOp =
         genericOp.getOperand(0).getDefiningOp<IREE::LinalgExt::Im2colOp>();
     if (!im2colOp) {
-      return rewriter.notifyMatchFailure(genericOp, "no im2colOp producer.");
+      im2colOp =
+          genericOp.getOperand(1).getDefiningOp<IREE::LinalgExt::Im2colOp>();
+      if (!im2colOp) {
+        return rewriter.notifyMatchFailure(genericOp, "no im2colOp producer.");
+      }
     }
 
     if (getLoweringConfig(genericOp)) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 14264d650fd4..25bcdea14568 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -1206,6 +1206,9 @@ static void buildROCDLCodegenConfigurationPassPipelineImpl(
     OpPassManager &modulePassManager) {
   {
     FunctionLikeNest funcPassManager(modulePassManager);
+    funcPassManager.addPredicatedPass(clLLVMGPUUseIgemm, []() {
+      return createConvolutionToIGEMMPass(igemmConfigFn);
+    });
     funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
     addCommonTargetExecutablePreprocessingPasses(funcPassManager);
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 925106ab4de0..245a533ded69 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -38,6 +38,7 @@ iree_lit_test_suite(
             "extract_address_computation_gpu.mlir",
             "gpu_set_num_workgroups.mlir",
             "gpu_pipeline_generalize_named_ops.mlir",
+            "gpu_pipeline_igemm.mlir",
             "nvvm_extract_address_computation.mlir",
             "nvvm_pipeline_test.mlir",
             "nvvm_mma_sync_pipeline_test.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 0c279642cb5c..c59d89cf3f19 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -35,6 +35,7 @@ iree_lit_test_suite(
     "elementwise_pipeline.mlir"
    "extract_address_computation_gpu.mlir"
     "gpu_pipeline_generalize_named_ops.mlir"
+    "gpu_pipeline_igemm.mlir"
     "gpu_set_num_workgroups.mlir"
     "illegal_configuration.mlir"
     "legalize.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir
new file mode 100644
index 000000000000..083e29ab74a1
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir
@@ -0,0 +1,69 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
+// RUN:   --iree-codegen-llvmgpu-use-igemm --iree-gpu-test-target=gfx940 --split-input-file %s | FileCheck %s
+
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-rocdl-configuration-pipeline)" \
+// RUN:   --iree-codegen-llvmgpu-use-igemm --iree-gpu-test-target=gfx940 --split-input-file %s | FileCheck %s
+
+// Make sure that the GPU configuration pipelines set correct translation info for igemm.
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+]>
+
+func.func @nhwc_conv() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant dense<1.0> : tensor<1x64xf32>
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x16x16x4xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x4x16xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<1x14x14x16xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x16x16x4xf32>> -> tensor<1x16x16x4xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x4x16xf32>> -> tensor<3x3x4x16xf32>
+  %empty = tensor.empty() : tensor<1x14x14x16xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32>
+  %5 = linalg.conv_2d_nhwc_hwcf
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
+    ins(%3, %4: tensor<1x16x16x4xf32>, tensor<3x3x4x16xf32>)
+    outs(%fill: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32>
+  flow.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 16], strides = [1, 1, 1, 1] : tensor<1x14x14x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x14x14x16xf32>>
+  return
+}
+// CHECK:       #[[$TRANSLATION_INFO:.+]] = #iree_codegen.translation_info
+// CHECK-SAME:    LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-LABEL: func.func @nhwc_conv
+// CHECK-SAME:    translation_info = #[[$TRANSLATION_INFO]]
+// CHECK:         iree_linalg_ext.im2col
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+]>
+
+func.func @nchw_conv() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant dense<1.0> : tensor<1x64xf32>
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x16x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x4x3x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<1x16x14x14xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 4, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x16x16xf32>> -> tensor<1x4x16x16xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4x3x3xf32>> -> tensor<16x4x3x3xf32>
+  %empty = tensor.empty() : tensor<1x16x14x14xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x16x14x14xf32>) -> tensor<1x16x14x14xf32>
+  %5 = linalg.conv_2d_nchw_fchw
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
+    ins(%3, %4: tensor<1x4x16x16xf32>, tensor<16x4x3x3xf32>)
+    outs(%fill: tensor<1x16x14x14xf32>) -> tensor<1x16x14x14xf32>
+  flow.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 14, 14], strides = [1, 1, 1, 1] : tensor<1x16x14x14xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x14x14xf32>>
+  return
+}
+// CHECK:       #[[$TRANSLATION_INFO:.+]] = #iree_codegen.translation_info
+// CHECK-SAME:    LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-LABEL: func.func @nchw_conv
+// CHECK-SAME:    translation_info = #[[$TRANSLATION_INFO]]
+// CHECK:         iree_linalg_ext.im2col