From 437611752055a0f3af168a8d20f7e35979927460 Mon Sep 17 00:00:00 2001
From: Kunwar Grover <fargoveam@gmail.com>
Date: Tue, 29 Oct 2024 16:59:36 +0000
Subject: [PATCH] [GPU] Do not treat pad as a tilable producer for operand
 promotion (#18918)

PadOp doesn't have an implementation for deriving thread configuration
from derived_thread_config, so ignore promoting it until an
implementation is added.
---
 .../Common/GPU/GPUPromoteMatmulOperands.cpp   | 12 +++++++---
 .../GPU/test/gpu_promote_matmul_operands.mlir | 24 +++++++++++++++++++
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
index dd498fad50e8..5e50a956bd82 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/GPUPromoteMatmulOperands.cpp
@@ -53,9 +53,15 @@ void promoteOperand(OpBuilder &builder, Operation *op, unsigned index) {
         return;
       }
     }
-    setLoweringConfig(producer, IREE::GPU::DerivedThreadConfigAttr::get(
-                                    builder.getContext()));
-    return;
+
+    // We only support thread tile size derivation of linalgOp and Im2colOp for
+    // now.
+    if (isa<linalg::LinalgOp, IREE::LinalgExt::Im2colOp>(
+            producer.getOperation())) {
+      setLoweringConfig(producer, IREE::GPU::DerivedThreadConfigAttr::get(
+                                      builder.getContext()));
+      return;
+    }
   }
 
   auto tensorType = dyn_cast<RankedTensorType>(operand.getType());
diff --git a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
index f05cf7b1890b..643b12c01e39 100644
--- a/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
+++ b/compiler/src/iree/compiler/Codegen/Common/GPU/test/gpu_promote_matmul_operands.mlir
@@ -82,3 +82,27 @@ func.func @no_promote_fill(%b: tensor<128x128xf32>) -> tensor<4x128xf32> {
 // CHECK-LABEL: func.func @no_promote_fill
 // CHECK-NOT: iree_gpu.derived_thread_config
 // CHECK: return
+
+// -----
+
+#lowering_config = #iree_gpu.lowering_config<{promote_operands = [0]}>
+
+func.func @promote_pad(%a : tensor<4x127xf32>, %b: tensor<128x128xf32>) -> tensor<4x128xf32> {
+  %cst = arith.constant 0.000000e+00 : f32
+  %empty = tensor.empty() : tensor<4x128xf32>
+  %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<4x128xf32>) -> tensor<4x128xf32>
+  %padded = tensor.pad %a low[0, 0] high[0, 1] {
+  ^bb0(%arg0: index, %arg1: index):
+    tensor.yield %cst : f32
+  } : tensor<4x127xf32> to tensor<4x128xf32>
+  %mm = linalg.matmul {lowering_config = #lowering_config}
+      ins(%padded, %b : tensor<4x128xf32>, tensor<128x128xf32>) outs(%fill : tensor<4x128xf32>) -> tensor<4x128xf32>
+  return %mm : tensor<4x128xf32>
+}
+
+// Verify that pad is promoted with linalg.copy
+// CHECK-LABEL: func.func @promote_pad
+// CHECK: tensor.pad
+// CHECK: linalg.copy
+// CHECK-SAME: derived_thread_config
+// CHECK: return