diff --git a/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp b/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
index 20c2e5ff1990..d3ceb7ea4271 100644
--- a/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/ConvolutionToIGEMM.cpp
@@ -41,7 +41,11 @@ struct SetIGEMMConfiguration final : OpRewritePattern<linalg::GenericOp> {
     auto im2colOp =
         genericOp.getOperand(0).getDefiningOp<IREE::LinalgExt::Im2colOp>();
     if (!im2colOp) {
-      return rewriter.notifyMatchFailure(genericOp, "no im2colOp producer.");
+      im2colOp =
+          genericOp.getOperand(1).getDefiningOp<IREE::LinalgExt::Im2colOp>();
+      if (!im2colOp) {
+        return rewriter.notifyMatchFailure(genericOp, "no im2colOp producer.");
+      }
     }
 
     if (getLoweringConfig(genericOp)) {
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
index 14264d650fd4..25bcdea14568 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/Passes.cpp
@@ -1206,6 +1206,9 @@ static void buildROCDLCodegenConfigurationPassPipelineImpl(
     OpPassManager &modulePassManager) {
   {
     FunctionLikeNest funcPassManager(modulePassManager);
+    funcPassManager.addPredicatedPass(clLLVMGPUUseIgemm, []() {
+      return createConvolutionToIGEMMPass(igemmConfigFn);
+    });
     funcPassManager.addPass(createGPUGeneralizeNamedOpsPass);
     addCommonTargetExecutablePreprocessingPasses(funcPassManager);
   }
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
index 925106ab4de0..245a533ded69 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/BUILD.bazel
@@ -38,6 +38,7 @@ iree_lit_test_suite(
             "extract_address_computation_gpu.mlir",
             "gpu_set_num_workgroups.mlir",
             "gpu_pipeline_generalize_named_ops.mlir",
+            "gpu_pipeline_igemm.mlir",
             "nvvm_extract_address_computation.mlir",
             "nvvm_pipeline_test.mlir",
             "nvvm_mma_sync_pipeline_test.mlir",
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
index 0c279642cb5c..c59d89cf3f19 100644
--- a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/CMakeLists.txt
@@ -35,6 +35,7 @@ iree_lit_test_suite(
     "elementwise_pipeline.mlir"
    "extract_address_computation_gpu.mlir"
     "gpu_pipeline_generalize_named_ops.mlir"
+    "gpu_pipeline_igemm.mlir"
     "gpu_set_num_workgroups.mlir"
     "illegal_configuration.mlir"
     "legalize.mlir"
diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir
new file mode 100644
index 000000000000..083e29ab74a1
--- /dev/null
+++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/test/gpu_pipeline_igemm.mlir
@@ -0,0 +1,69 @@
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-llvmgpu-configuration-pipeline)" \
+// RUN:   --iree-codegen-llvmgpu-use-igemm --iree-gpu-test-target=gfx940 --split-input-file %s | FileCheck %s
+
+// RUN: iree-opt --pass-pipeline="builtin.module(iree-codegen-rocdl-configuration-pipeline)" \
+// RUN:   --iree-codegen-llvmgpu-use-igemm --iree-gpu-test-target=gfx940 --split-input-file %s | FileCheck %s
+
+// Make sure that the GPU configuration pipelines set correct translation info for igemm.
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+]>
+
+func.func @nhwc_conv() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant dense<1.0> : tensor<1x64xf32>
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x16x16x4xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<3x3x4x16xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<1x14x14x16xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 16, 16, 4], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x16x16x4xf32>> -> tensor<1x16x16x4xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [3, 3, 4, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<3x3x4x16xf32>> -> tensor<3x3x4x16xf32>
+  %empty = tensor.empty() : tensor<1x14x14x16xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32>
+  %5 = linalg.conv_2d_nhwc_hwcf
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
+    ins(%3, %4: tensor<1x16x16x4xf32>, tensor<3x3x4x16xf32>)
+    outs(%fill: tensor<1x14x14x16xf32>) -> tensor<1x14x14x16xf32>
+  flow.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [1, 14, 14, 16], strides = [1, 1, 1, 1] : tensor<1x14x14x16xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x14x14x16xf32>>
+  return
+}
+// CHECK:       #[[$TRANSLATION_INFO:.+]] = #iree_codegen.translation_info
+// CHECK-SAME:    LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-LABEL: func.func @nhwc_conv
+// CHECK-SAME:    translation_info = #[[$TRANSLATION_INFO]]
+// CHECK:         iree_linalg_ext.im2col
+
+// -----
+
+#pipeline_layout = #hal.pipeline.layout<bindings = [
+  #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">,
+  #hal.pipeline.binding<storage_buffer, ReadOnly>,
+  #hal.pipeline.binding<storage_buffer, Indirect>
+]>
+
+func.func @nchw_conv() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %cst_0 = arith.constant dense<1.0> : tensor<1x64xf32>
+  %c0 = arith.constant 0 : index
+  %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1x4x16x16xf32>>
+  %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags(ReadOnly) : !flow.dispatch.tensor<readonly:tensor<16x4x3x3xf32>>
+  %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<1x16x14x14xf32>>
+  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0, 0], sizes = [1, 4, 16, 16], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<1x4x16x16xf32>> -> tensor<1x4x16x16xf32>
+  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [16, 4, 3, 3], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<16x4x3x3xf32>> -> tensor<16x4x3x3xf32>
+  %empty = tensor.empty() : tensor<1x16x14x14xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1x16x14x14xf32>) -> tensor<1x16x14x14xf32>
+  %5 = linalg.conv_2d_nchw_fchw
+    {dilations = dense<1> : tensor<2xi64>, strides = dense<1> : tensor<2xi64> }
+    ins(%3, %4: tensor<1x4x16x16xf32>, tensor<16x4x3x3xf32>)
+    outs(%fill: tensor<1x16x14x14xf32>) -> tensor<1x16x14x14xf32>
+  flow.dispatch.tensor.store %5, %2, offsets = [0, 0, 0, 0], sizes = [1, 16, 14, 14], strides = [1, 1, 1, 1] : tensor<1x16x14x14xf32> -> !flow.dispatch.tensor<writeonly:tensor<1x16x14x14xf32>>
+  return
+}
+// CHECK:       #[[$TRANSLATION_INFO:.+]] = #iree_codegen.translation_info
+// CHECK-SAME:    LLVMGPUTileAndFuse workgroup_size = [64, 1, 1] subgroup_size = 64
+// CHECK-LABEL: func.func @nchw_conv
+// CHECK-SAME:    translation_info = #[[$TRANSLATION_INFO]]
+// CHECK:         iree_linalg_ext.im2col