From 1ce90893dda91c46a4171059e22cc2132baf71a5 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 6 Jul 2023 02:04:01 -0400 Subject: [PATCH 01/12] [SPIRV] Add transform dialect JIT mode Adds the ability to use the transform dialect strategy builders behind `iree-spirv-enable-transform-dialect-jit`, mirroring the existing flags for LLVMCPU/GPU. --- .../compiler/Codegen/LLVMGPU/KernelConfig.cpp | 24 ++++++ .../TransformExtensions/LLVMGPUExtensions.cpp | 5 +- .../LLVMGPUExtensionsOps.td | 2 +- .../compiler/Codegen/SPIRV/KernelConfig.cpp | 78 +++++++++++++++++-- .../GPU/AbstractGemmLikeStrategy.cpp | 41 +++++----- .../GPU/AbstractGemmLikeStrategy.h | 8 ++ .../TransformStrategies/GPU/Common.cpp | 7 +- .../Codegen/TransformStrategies/GPU/Common.h | 7 +- .../GPU/MatmulTensorCoreStrategy.cpp | 30 ++++++- .../TransformStrategies/GPU/PadStrategy.h | 2 +- .../TransformStrategies/GPU/Strategies.h | 15 ++++ 11 files changed, 183 insertions(+), 36 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp index 17189e927bfb..bd192b3cbb5c 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/KernelConfig.cpp @@ -692,6 +692,30 @@ static LogicalResult setTransformDialectConfig(func::FuncOp entryPoint, gpuModel.hasTF32TensorCore = targetInfo.hasTF32TensorCore; gpuModel.hasMmaSync = targetInfo.hasMmaSync; + // Populates a subset of the fragment combinations supported in MLIR lowerings + // to NVVM (which is itself a subset of what LLVM supports) based on what the + // pipeline currently supports. + // TODO: avoid hard coding this and populate based on hardware capabilities. + // TODO: add missing supported configs once the pipeline supports it. + MLIRContext *context = entryPoint.getContext(); + Type f32Type = Float32Type::get(context); + Type f16Type = Float16Type::get(context); + + iree_compiler::gpu::MMAConfig f16f32AccConfig = { + /*m=*/16, /*n=*/16, /*k=*/16, + /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f32Type}; + iree_compiler::gpu::MMAConfig f16f16AccConfig = { + /*m=*/16, /*n=*/16, /*k=*/16, + /*aType=*/f16Type, /*bType=*/f16Type, /*cType=*/f16Type}; + gpuModel.supportedWMMAConfigs = {f16f32AccConfig, f16f16AccConfig}; + + if (targetInfo.hasTF32TensorCore) { + iree_compiler::gpu::MMAConfig tf32WmmaConfig = { + /*m=*/16, /*n=*/16, /*k=*/8, + /*aType=*/f32Type, /*bType=*/f32Type, /*cType=*/f32Type}; + gpuModel.supportedWMMAConfigs.push_back(tf32WmmaConfig); + } + if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, gpuModel))) return failure(); diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp index f101d90d2993..5c6b2f747682 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.cpp @@ -126,9 +126,10 @@ void transform_dialect::MapNestedForallToGpuThreadsOp::build( void transform_dialect::MapNestedForallToGpuThreadsOp::build( OpBuilder &builder, OperationState &state, Value target, ArrayRef workgroupDims, ArrayRef warpDims, - int64_t subgroupSize) { + std::optional subgroupSize) { build(builder, state, {}, target, workgroupDims, warpDims, - builder.getI64IntegerAttr(subgroupSize)); + subgroupSize ? 
builder.getI64IntegerAttr(*subgroupSize) + : IntegerAttr()); } void transform_dialect::MapNestedForallToGpuThreadsOp::getEffects( diff --git a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td index 8956a6ba77ab..ce43cd560068 100644 --- a/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensionsOps.td @@ -110,7 +110,7 @@ def MapNestedForallToGpuThreadsOp : OpBuilder<(ins "Value":$target, "ArrayRef":$workgroup_dims, "ArrayRef":$warp_dims, - "int64_t":$subgroupSize)> + "std::optional":$subgroupSize)> ]; let extraClassDeclaration = [{ ::mlir::DiagnosedSilenceableFailure applyToOne( diff --git a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp index fd97fe5c4f9a..b81d4c9a584a 100644 --- a/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp +++ b/compiler/src/iree/compiler/Codegen/SPIRV/KernelConfig.cpp @@ -13,6 +13,7 @@ #include "iree/compiler/Codegen/Common/UserConfig.h" #include "iree/compiler/Codegen/Dialect/IREECodegenAttrs.h" #include "iree/compiler/Codegen/SPIRV/Utils.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" #include "iree/compiler/Codegen/Utils/GPUUtils.h" #include "iree/compiler/Dialect/Flow/IR/FlowOps.h" #include "llvm/ADT/ArrayRef.h" @@ -51,6 +52,11 @@ llvm::cl::opt clSPIRVTransformDialectFileName( "MLIR file containing a transform dialect specification to apply"), llvm::cl::init("")); +llvm::cl::opt clSPIRVEnableTransformDialectJit( + "iree-spirv-enable-transform-dialect-jit", + llvm::cl::desc("enable the usage of the transform dialect JIT"), + llvm::cl::init(false)); + using CodeGenPipeline = IREE::Codegen::DispatchLoweringPassPipeline; //===----------------------------------------------------------------------===// @@ -1515,6 +1521,68 @@ static LogicalResult setDefaultOpConfig(spirv::ResourceLimitsAttr limits, workgroupSize); } +//===----------------------------------------------------------------------===// +// Transform Dialect Specialized Configurations +//===----------------------------------------------------------------------===// + +static LogicalResult +setTransformDialectConfig(func::FuncOp entryPoint, Operation *op, + const spirv::TargetEnv &targetEnv) { + if (!clSPIRVEnableTransformDialectJit && + clSPIRVTransformDialectFileName.empty()) { + return failure(); + } + + MLIRContext *context = entryPoint.getContext(); + + // Prefer a transform script file if provided. + if (!clSPIRVTransformDialectFileName.empty()) { + auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( + context, CodeGenPipeline::TransformDialectCodegen); + LLVM_DEBUG(llvm::dbgs() << "using user specified transform dialect...\n"); + return setTranslationInfo(entryPoint, translationInfo); + } + + auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( + entryPoint.getContext(), + IREE::Codegen::DispatchLoweringPassPipeline::TransformDialectCodegen); + if (!clSPIRVTransformDialectFileName.empty()) { + return setTranslationInfo(entryPoint, translationInfo); + } + + spirv::ResourceLimitsAttr limits = targetEnv.getResourceLimits(); + + // TODO: unify the target informations into one structure. 
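+  // The model populated below is what the strategy validators later consult.
+  // For instance, whether an f16 x f16 -> f32 16x16x16 fragment is available
+  // can be checked with a scan like the following sketch (illustrative):
+  //   bool hasF16F32Wmma =
+  //       llvm::any_of(gpuModel.supportedWMMAConfigs,
+  //                    [](const iree_compiler::gpu::MMAConfig &c) {
+  //                      return c.m == 16 && c.n == 16 && c.k == 16 &&
+  //                             c.aType.isF16() && c.bType.isF16() &&
+  //                             c.cType.isF32();
+  //                    });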
+ iree_compiler::gpu::GPUModel gpuModel; + gpuModel.hasWarpShuffle = + targetEnv.allows(spirv::Capability::GroupNonUniformShuffle); + gpuModel.hasTF32TensorCore = false; + gpuModel.hasMmaSync = false; + gpuModel.minSubgroupSize = limits.getMinSubgroupSize(); + gpuModel.maxSubgroupSize = limits.getMaxSubgroupSize(); + gpuModel.maxWorkGroupInvocations = limits.getMaxComputeWorkgroupInvocations(); + + // Populates the supported WMMA fragment combinations from the target + // environment. Infer tf32 support from the list of supported fragment types. + Type f32Type = Float32Type::get(context); + auto properties = limits.getCooperativeMatrixPropertiesNv() + .getAsRange(); + for (auto property : properties) { + if (property.getScope().getValue() != spirv::Scope::Subgroup) + continue; + gpuModel.supportedWMMAConfigs.push_back(iree_compiler::gpu::MMAConfig{ + property.getMSize(), property.getNSize(), property.getKSize(), + property.getAType(), property.getBType(), property.getCType()}); + if (property.getAType() == f32Type && property.getBType() == f32Type) + gpuModel.hasTF32TensorCore = true; + } + + if (failed(iree_compiler::gpu::matchAndSetTransformStrategy(entryPoint, op, + gpuModel))) + return failure(); + return setTranslationInfo(entryPoint, translationInfo); +} + //===----------------------------------------------------------------------===// // Configuration Dispatcher //===----------------------------------------------------------------------===// @@ -1531,13 +1599,9 @@ static LogicalResult setSPIRVOpConfig(const spirv::TargetEnv &targetEnv, return setUserConfig(entryPointFn, rootOp, compilationInfo); } - if (!clSPIRVTransformDialectFileName.empty()) { - MLIRContext *context = entryPointFn.getContext(); - auto translationInfo = IREE::Codegen::TranslationInfoAttr::get( - context, CodeGenPipeline::TransformDialectCodegen); - LLVM_DEBUG(llvm::dbgs() << "using user specified transform dialect...\n"); - - return setTranslationInfo(entryPointFn, translationInfo); + // First try to see if there is a matching transform dialect configuration. + if (succeeded(setTransformDialectConfig(entryPointFn, rootOp, targetEnv))) { + return success(); } // First try to find a proper CodeGen configuration to tile and vectorize for diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp index 8787c6359ddc..bc992ddeb712 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.cpp @@ -91,6 +91,26 @@ void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { cliOptionsSpecified = true; } + /// If not specified, select instructions to target for compute. + if (!useMmaSync && !useWmma && !useFma) { + /// First, try to use tensor core. + if (getLhsElementalType() == getRhsElementalType()) { + /// Currently all supported targets at least have WMMA. + /// TODO: Handle targets without tensor core. + if (gpuModel.hasMmaSync) + useMmaSync = true; + else + useWmma = true; + } else { + /// Mixed precision only supported by fma. + useFma = true; + } + } + + /// Prefer smaller subgroup sizes for tensor core strategies. + if (!useFma) + targetSubgroupSize = gpuModel.minSubgroupSize; + /// Default configuration based on hardware properties and problem bit widths. 
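+  /// For example, an f16 x f16 -> f32 problem on a target that only reports
+  /// WMMA support selects useWmma together with the smallest supported
+  /// subgroup size, while a mixed-precision problem (e.g. i8 x f16) falls
+  /// back to plain FMA and keeps the default subgroup size.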
if (clBlockTileSizes.getNumOccurrences()) { blockTileSizes = @@ -105,7 +125,7 @@ void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { // Infer from warp counts if present. if (clNumWarps.getNumOccurrences()) { numThreads = SmallVector(clNumWarps.begin(), clNumWarps.end()); - numThreads[0] *= gpuModel.subgroupSize; + numThreads[0] *= getSubgroupSize(); } else { numThreads = SmallVector{64, 2, 1}; } @@ -114,7 +134,7 @@ void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { numWarps = SmallVector(clNumWarps.begin(), clNumWarps.end()); } else { numWarps = numThreads; - numWarps[0] = mlir::ceilDiv(numWarps[0], gpuModel.subgroupSize); + numWarps[0] = mlir::ceilDiv(numWarps[0], getSubgroupSize()); } if (clUseAsyncCopies.getNumOccurrences()) useAsyncCopies = clUseAsyncCopies; @@ -126,21 +146,6 @@ void AbstractGemmLikeStrategy::initDefaultValues(const GPUModel &gpuModel) { useWmma = clUseWmma; if (clUseFma.getNumOccurrences()) useFma = clUseFma; - /// If not specified, select instructions to target for compute. - if (!useMmaSync && !useWmma && !useFma) { - /// First, try to use tensor core. - if (getLhsElementalType() == getRhsElementalType()) { - /// Currently all supported targets at least have WMMA. - /// TODO: Handle targets without tensor core. - if (gpuModel.hasMmaSync) - useMmaSync = true; - else - useWmma = true; - } else { - /// Mixed precision only supported by fma. - useFma = true; - } - } if (clReductionTileSize.getNumOccurrences()) { reductionTileSize = clReductionTileSize; } else { @@ -175,7 +180,7 @@ AbstractGemmLikeStrategy::getZeroPadAttrFromElementalTypes(OpBuilder &b) const { LogicalResult AbstractGemmLikeStrategy::validate(const GPUModel &gpuModel) const { - if (totalNumThreads() != totalNumWarps() * gpuModel.subgroupSize) { + if (totalNumThreads() != totalNumWarps() * getSubgroupSize()) { llvm::errs() << "Number of threads specified by warps must match total " "number of threads\n"; return failure(); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h index 8a3e5e40b917..d9557b95dee5 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h @@ -38,6 +38,14 @@ struct AbstractGemmLikeStrategy : GPUStrategy { /// override the user's choices. bool cliOptionsSpecified = false; + /// Non-default subgroup size to use configured based on hardware supported + /// values. + std::optional targetSubgroupSize = std::nullopt; + + int64_t getSubgroupSize() const { + return targetSubgroupSize ? *targetSubgroupSize : subgroupSize; + } + //===--------------------------------------------------------------------===// // Parameters that control the tiling and mapping. //===--------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp index 3f006c68556c..6d4bc83cb27e 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp @@ -142,9 +142,12 @@ static std::pair computeSplitPoint(int64_t upperBound, /// func.func. 
Value mlir::iree_compiler::gpu::buildMapToBlockAndThreads( ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, - ArrayRef warpDims) { + ArrayRef warpDims, std::optional subgroupSize) { b.create(funcH); - b.create(funcH, blockSize, warpDims); + auto mapToThreadsOp = + b.create(funcH, blockSize, warpDims); + if (subgroupSize) + mapToThreadsOp.setSubgroupSize(*subgroupSize); return funcH; } diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h index 1a6746aab0ba..cdf43771d3f5 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.h @@ -73,9 +73,10 @@ int64_t adjustNumberOfWarpsForBlockShuffle(int64_t numWarpsToUse, /// Takes an optional `warpDims` argument to specify the number of warp /// dimensions to consider along various dimensions and avoid second-guessing /// how the mapping to warps should occur. -Value buildMapToBlockAndThreads(ImplicitLocOpBuilder &b, Value funcH, - ArrayRef blockSize, - ArrayRef warpDims = {}); +Value buildMapToBlockAndThreads( + ImplicitLocOpBuilder &b, Value funcH, ArrayRef blockSize, + ArrayRef warpDims = {}, + std::optional subgroupSize = std::nullopt); /// Post-bufferization vector distribution with rank-reduction. /// Takes a handle to a func.func and returns an updated handle to a diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp index f92f8f21631d..f8b303dde33e 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.cpp @@ -96,6 +96,29 @@ LogicalResult MatmulStrategy::validate(const GPUModel &gpuModel) const { return failure(); } + if (useMmaSync) { + if (!gpuModel.hasMmaSync) { + LDBG("--Matmul strategy target does not support MMA.SYNC operations\n"); + return failure(); + } + } else { + // Verify WMMA. + // Hard coded to reflect current WMMA unrolling support. + int reqM = 16; + int reqN = 16; + int reqK = lhsElementType.isF32() ? 8 : 16; + if (llvm::all_of(gpuModel.supportedWMMAConfigs, + [&](iree_compiler::gpu::MMAConfig config) { + return config.m != reqM || config.n != reqN || + config.k != reqK || + config.aType != lhsElementType || + config.bType != rhsElementType || + config.cType != resElementType; + })) { + LDBG("--Matmul strategy failed wmma type check\n"); + return failure(); + } + } return success(); } @@ -238,8 +261,11 @@ buildCommonMatmulLikeThreadSchedule(ImplicitLocOpBuilder &b, Value variantH, // Need to match again since bufferize invalidated all handles. // TODO: assumes a single func::FuncOp to transform, needs hardening. funcH = b.create(variantH, func::FuncOp::getOperationName()); - funcH = buildMapToBlockAndThreads(b, funcH, strategy.numThreads, - strategy.numWarps); + funcH = + buildMapToBlockAndThreads(b, funcH, + /*blockSize=*/strategy.numThreads, + /*warpDims=*/strategy.numWarps, + /*subgroupSize=*/strategy.targetSubgroupSize); funcH = b.create(funcH); // Step 9. Convert to tensor core ops. 
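For reference, the extended `buildMapToBlockAndThreads` entry point forwards
the optional subgroup size to `MapNestedForallToGpuThreadsOp`; a call-site
sketch (the concrete sizes are illustrative only):

    // Map to a 64x2x1 block, 2x2x1 warps, and request a subgroup size of 32.
    funcH = buildMapToBlockAndThreads(b, funcH,
                                      /*blockSize=*/{64, 2, 1},
                                      /*warpDims=*/{2, 2, 1},
                                      /*subgroupSize=*/32);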
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h index 8c3e0dab5e1e..cba7da3dde43 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h @@ -20,7 +20,7 @@ namespace gpu { struct PadConfig {}; /// Simple padding strategy. -class PadStrategy : GPUStrategy { +class PadStrategy : public GPUStrategy { public: PadStrategy(MLIRContext *context, const transform_ext::MatchedPadCaptures &captures, diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h index 852d5f16dc7a..5f7e051c91ad 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h @@ -26,6 +26,18 @@ class StagedReductionStrategy; static constexpr int64_t kCudaWarpSize = 32; static constexpr int64_t kCudaMaxNumThreads = 1024; +/// Placeholder for representing supported WMMA/Cooperative Matrix +/// configurations. This is a reflection of +/// SPIRV_CooperativeMatrixPropertiesNVArrayAttr. +struct MMAConfig { + int64_t m; + int64_t n; + int64_t k; + Type aType; + Type bType; + Type cType; +}; + /// Placeholder for some hardware model proxy that contains relevant information /// to configure the strategies. In the future, this will need to be /// driven by some contract with the runtime. @@ -34,11 +46,14 @@ struct GPUModel { llvm::StringRef model = kDefaultGPU; /// TODO: Support a range of subgroup sizes. int64_t subgroupSize = kCudaWarpSize; + std::optional minSubgroupSize = std::nullopt; + std::optional maxSubgroupSize = std::nullopt; int64_t maxWorkGroupInvocations = kCudaMaxNumThreads; int64_t maxWorkGroupSize[3] = {1024, 1024, 64}; bool hasWarpShuffle = false; bool hasTF32TensorCore = false; bool hasMmaSync = false; + SmallVector supportedWMMAConfigs = {}; }; //===--------------------------------------------------------------------===// From 7b7e3a505ff7557d20c4fbb5d68edb307c2028d8 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Sun, 23 Apr 2023 11:33:38 -0400 Subject: [PATCH 02/12] Add convolution interface based ops to DetachElementwiseFromNamedOps DetachElementwiseFromNamedOps is used to replace pre-filled outputs with a zero-fill + add for contracting ops (gemm, conv). This extends the pattern to the convolution interface to allow non-named cases. Renaming of the pass can happen as a follow up if/when this is upstreamed. 
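Conceptually the added check wraps the upstream convolution-interface matcher;
as a sketch (the helper name here is assumed):

    // Returns true when `linalgOp` satisfies the convolution interface, i.e.
    // the upstream matcher reports no failure message.
    static bool isConvolutionLike(linalg::LinalgOp linalgOp) {
      return linalg::detail::getMatchConvolutionMessage(
                 linalg::detail::isConvolutionInterfaceImpl(linalgOp))
          .empty();
    }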
--- .../Flow/Transforms/DetachElementwiseFromNamedOps.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DetachElementwiseFromNamedOps.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DetachElementwiseFromNamedOps.cpp index 9e228d0f4ccc..f3b05e96c5a6 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/DetachElementwiseFromNamedOps.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/DetachElementwiseFromNamedOps.cpp @@ -39,7 +39,10 @@ struct DetachElementwisePattern LogicalResult matchAndRewrite(linalg::LinalgOp linalgOp, PatternRewriter &rewriter) const override { if (!linalg::isaContractionOpInterface(linalgOp) && - !isa(*linalgOp)) { + !isa(*linalgOp) && + !linalg::detail::getMatchConvolutionMessage( + mlir::linalg::detail::isConvolutionInterfaceImpl(linalgOp)) + .empty()) { return failure(); } if (!linalgOp.hasTensorSemantics()) From 8dd79a3a0d63b721fae89bc817216079b0c858cd Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Fri, 7 Jul 2023 17:35:54 -0400 Subject: [PATCH 03/12] [TransformMatchers] Match leading pad in convolution matcher Towards pad fused convolution strategies. --- .../TransformStrategies/GPU/Strategies.cpp | 6 ++++-- .../iree-dialects/Transforms/TransformMatchers.h | 2 ++ .../IR/StructuredTransformOpsExt.cpp | 7 ++++++- .../lib/Transforms/TransformMatchers.cpp | 15 +++++++++++++-- 4 files changed, 25 insertions(+), 5 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp index 32b9490fafb0..ea12b1fd3472 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp @@ -614,12 +614,14 @@ static LogicalResult matchAndSetConvolutionStrategy(func::FuncOp entryPoint, } // 1. Match a reduction and surrounding ops. + CapturingOpMatcher *pad; StructuredOpMatcher *fill; StructuredOpMatcher *convolution; StructuredOpMatcher *trailing; transform_ext::MatchedConvolutionCaptures captures; transform_ext::MatcherContext matcherContext; - makeConvolutionMatcher(matcherContext, convolution, fill, trailing, captures, + makeConvolutionMatcher(matcherContext, convolution, pad, fill, trailing, + captures, /*mustMatchEntireFunc=*/true); if (!matchPattern(op, *convolution)) { LDBG("--Implicit gemm strategy fail to match\n"); @@ -631,7 +633,7 @@ static LogicalResult matchAndSetConvolutionStrategy(func::FuncOp entryPoint, // - Mandatory fill op. // - Require minimum tile alignment due to img2col. // - Otherwise, we take it. 
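+  // - Reject a convolution whose input is produced by a leading tensor.pad;
+  //   the implicit gemm path does not handle a fused pad producer.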
- if (!fill->getCaptured() || trailing->getCaptured()) { + if (!fill->getCaptured() || trailing->getCaptured() || pad->getCaptured()) { LDBG("--Implicit gemm strategy fill / trailing preconditions failed\n"); return failure(); } diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h index 93249b28a322..ebcc6f9ed70f 100644 --- a/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h +++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h @@ -1133,6 +1133,7 @@ void makeSoftmaxMatcher(MatcherContext &context, struct MatchedConvolutionCaptures { Type inputElementType, filterElementType, outputElementType; mlir::linalg::detail::ConvolutionDimensions convolutionDims = {}; + SmallVector padOpSizes = {}; SmallVector convolutionOpSizes = {}; SmallVector trailingOpSizes = {}; int64_t maybeTrailingOutputElementalTypeBitWidth = 0; @@ -1149,6 +1150,7 @@ struct MatchedConvolutionCaptures { /// tileable operations in the functions are captured. void makeConvolutionMatcher(MatcherContext &context, StructuredOpMatcher *&convolutionCapture, + CapturingOpMatcher *&padCapture, StructuredOpMatcher *&fillCapture, StructuredOpMatcher *&trailingCapture, MatchedConvolutionCaptures &captures, diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp index ed3dfd68c4ec..1391ef47c7cf 100644 --- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp +++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp @@ -696,10 +696,11 @@ convolutionCallback(transform_ext::MatchCallbackResult &res, Location loc, << "expected one handle to one operation"; } + transform_ext::CapturingOpMatcher *pad; transform_ext::StructuredOpMatcher *pattern, *fill, *trailing; transform_ext::MatchedConvolutionCaptures ignore; transform_ext::MatcherContext matcherContext; - makeConvolutionMatcher(matcherContext, pattern, fill, trailing, ignore, + makeConvolutionMatcher(matcherContext, pattern, pad, fill, trailing, ignore, /*mustMatchEntireFunc=*/true); // TODO: need a mechanism for this to go around the entire IR, @@ -713,6 +714,9 @@ convolutionCallback(transform_ext::MatchCallbackResult &res, Location loc, // TODO: notify properly. 
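+  // Payload groups are returned in the order: pad, fill, convolution,
+  // trailing; the pad, fill, and trailing groups may be empty.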
LLVM_DEBUG({ + DBGS() << "pad:\n"; + if (pad->getCaptured()) + DBGS() << pad->getCaptured() << "\n"; DBGS() << "fill:\n"; if (fill->getCaptured()) DBGS() << fill->getCaptured() << "\n"; @@ -722,6 +726,7 @@ convolutionCallback(transform_ext::MatchCallbackResult &res, Location loc, DBGS() << trailing->getCaptured() << "\n"; }); + res.addPotentiallyEmptyPayloadGroup(pad->getCaptured()); res.addPotentiallyEmptyPayloadGroup(fill->getCaptured()); res.addPayloadGroup({pattern->getCaptured()}); res.addPotentiallyEmptyPayloadGroup(trailing->getCaptured()); diff --git a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp index 6ed12a83110e..7be58f0f71b3 100644 --- a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp +++ b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp @@ -1704,6 +1704,7 @@ void transform_ext::makeSoftmaxMatcher( void transform_ext::makeConvolutionMatcher( transform_ext::MatcherContext &matcherContext, transform_ext::StructuredOpMatcher *&convolutionCapture, + transform_ext::CapturingOpMatcher *&padCapture, transform_ext::StructuredOpMatcher *&fillCapture, transform_ext::StructuredOpMatcher *&trailingCapture, MatchedConvolutionCaptures &captures, bool mustMatchEntireFunc) { @@ -1721,6 +1722,15 @@ void transform_ext::makeConvolutionMatcher( .output(0, CaptureElementType(captures.outputElementType)); convolutionCapture = &convolution; + auto &value = transform_ext::m_ShapedValue(matcherContext); + value.dim(transform_ext::AllDims(), + transform_ext::CaptureDims(captures.padOpSizes)); + auto &pad = transform_ext::m_tensorPad(matcherContext) + .result(0, value) + .yieldsExternalValue(); + convolution = convolution.input(0, pad, OptionalMatch()); + padCapture = &pad; + // Optional FillOp to create the unique output of the convolution. auto &fill = m_StructuredOp(matcherContext) .output(0, CaptureElementTypeBitWidth( @@ -1757,10 +1767,11 @@ void transform_ext::makeConvolutionMatcher( transform_ext::MatcherContext &context, StructuredOpMatcher *&convolutionCapture, MatchedConvolutionCaptures &captures, bool mustMatchEntireFunc) { + CapturingOpMatcher *pad; StructuredOpMatcher *fill; StructuredOpMatcher *trailing; - makeConvolutionMatcher(context, convolutionCapture, fill, trailing, captures, - mustMatchEntireFunc); + makeConvolutionMatcher(context, convolutionCapture, pad, fill, trailing, + captures, mustMatchEntireFunc); } void transform_ext::makePadMatcher(MatcherContext &context, From b6010a59a99fafbb1b37cbfc6a09155df3e9dafd Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Mon, 10 Jul 2023 04:49:58 -0400 Subject: [PATCH 04/12] [TransformMatchers] Enable matching generic convolutions Removes the restriction for named ops only on the convolution matcher, instead using the interface. 
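In other words, instead of anchoring on a fixed list of named convolution ops,
the matcher now accepts any linalg op whose loop structure satisfies the
convolution interface, since the `convolutionDims` capture already relies on
the upstream convolution dimension inference. As a sketch (the named-op list
shown is only illustrative of the earlier form):

    // Before: restricted to specific named convolution ops.
    //   m_StructuredOp<linalg::Conv2DNhwcHwcfOp,
    //                  linalg::Conv2DNchwFchwOp>(matcherContext)
    // After: any structured op, so linalg.generic convolutions also match.
    //   m_StructuredOp(matcherContext)
    //       .convolutionDims(CaptureConvDims(captures.convolutionDims))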
--- .../iree-dialects/lib/Transforms/TransformMatchers.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp index 7be58f0f71b3..322bd4d08aed 100644 --- a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp +++ b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp @@ -1710,8 +1710,8 @@ void transform_ext::makeConvolutionMatcher( MatchedConvolutionCaptures &captures, bool mustMatchEntireFunc) { // The core part of the matcher is anchored on a particular convolution op. auto &convolution = - m_StructuredOp( - matcherContext) + m_StructuredOp(matcherContext) // Capture convolution dim classifications. .convolutionDims(CaptureConvDims(captures.convolutionDims)) // Capture op sizes. From c725a07cab43991aac436c847457a50307cfaaba Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Mon, 10 Jul 2023 00:15:00 -0400 Subject: [PATCH 05/12] [TransformStrategies] Add support for data tiled tensorcore conv Adds a builder for mapping data tiled convolutions to a direct tensorcore approach (mainly targeting wmma for now). This generates a loop over the input channels, promotion of the padded input tile to shared memory, and then two more inner loops over the convolution filter. --- .../GPU/AbstractGemmLikeStrategy.h | 9 +- .../TransformStrategies/GPU/BUILD.bazel | 2 + .../TransformStrategies/GPU/CMakeLists.txt | 2 + .../TransformStrategies/GPU/Common.cpp | 34 ++- .../GPU/ConvolutionTensorCoreStrategy.cpp | 237 ++++++++++++++++++ .../GPU/ConvolutionTensorCoreStrategy.h | 208 +++++++++++++++ .../TransformStrategies/GPU/Strategies.cpp | 128 +++++++++- .../TransformStrategies/GPU/Strategies.h | 12 +- 8 files changed, 608 insertions(+), 24 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h index d9557b95dee5..691c1f656068 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h @@ -102,16 +102,19 @@ struct AbstractGemmLikeStrategy : GPUStrategy { return getResElementalType().getIntOrFloatBitWidth(); } - bool alignedLhs() const { + virtual bool alignedLhs() const { return m() % blockTileM() == 0 && k() % reductionTileSize == 0; } - bool alignedRhs() const { + virtual bool alignedRhs() const { return n() % blockTileN() == 0 && k() % reductionTileSize == 0; } - bool alignedRes() const { + virtual bool alignedRes() const { return m() % blockTileM() == 0 && n() % blockTileN() == 0; } + virtual bool hasLhsCopy() const { return true; } + virtual bool hasRhsCopy() const { return true; } + virtual MappingInfo lhsCopyMapping() const = 0; virtual LogicalResult validateLhsCopyMapping() const = 0; virtual MappingInfo rhsCopyMapping() const = 0; diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel index 1ed72bde857e..d2e3bba16743 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel +++ 
b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel @@ -18,6 +18,7 @@ iree_compiler_cc_library( "AbstractGemmLikeStrategy.cpp", "Common.cpp", "ConvolutionImplicitGemmStrategy.cpp", + "ConvolutionTensorCoreStrategy.cpp", "CopyMapping.cpp", "MappingInfo.cpp", "MatmulTensorCoreStrategy.cpp", @@ -30,6 +31,7 @@ iree_compiler_cc_library( "AbstractGemmLikeStrategy.h", "Common.h", "ConvolutionImplicitGemmStrategy.h", + "ConvolutionTensorCoreStrategy.h", "CopyMapping.h", "MappingInfo.h", "MatmulTensorCoreStrategy.h", diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt index 6eef68d20ad2..3ae45c54c59f 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt @@ -17,6 +17,7 @@ iree_cc_library( "AbstractGemmLikeStrategy.h" "Common.h" "ConvolutionImplicitGemmStrategy.h" + "ConvolutionTensorCoreStrategy.h" "CopyMapping.h" "MappingInfo.h" "MatmulTensorCoreStrategy.h" @@ -28,6 +29,7 @@ iree_cc_library( "AbstractGemmLikeStrategy.cpp" "Common.cpp" "ConvolutionImplicitGemmStrategy.cpp" + "ConvolutionTensorCoreStrategy.cpp" "CopyMapping.cpp" "MappingInfo.cpp" "MatmulTensorCoreStrategy.cpp" diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp index 6d4bc83cb27e..0e1a705f227a 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp @@ -443,22 +443,30 @@ mlir::iree_compiler::gpu::buildDistributeMatmulCopies( paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(1)); // Rewrite aligned pads as destination passing (linalg.copy) - if (strategy.alignedLhs() && strategy.packingDimensions[0]) + if (strategy.alignedLhs() && strategy.packingDimensions[0] && + strategy.hasLhsCopy()) lhsH = b.create(lhsH.getType(), lhsH); - if (strategy.alignedRhs() && strategy.packingDimensions[1]) + if (strategy.alignedRhs() && strategy.packingDimensions[1] && + strategy.hasRhsCopy()) rhsH = b.create(rhsH.getType(), rhsH); - MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); - Value lhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, lhsH, /*numThreads=*/lhsCopyMapping.numThreads, - /*threadDimMapping=*/lhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedLhs()); - - MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); - Value rhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( - b, variantH, rhsH, /*numThreads=*/rhsCopyMapping.numThreads, - /*threadDimMapping=*/rhsCopyMapping.threadMapping, - /*foldIfBranch=*/!strategy.alignedRhs()); + Value lhsCopyOpH = lhsH; + if (strategy.hasLhsCopy()) { + MappingInfo lhsCopyMapping = strategy.lhsCopyMapping(); + lhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( + b, variantH, lhsH, /*numThreads=*/lhsCopyMapping.numThreads, + /*threadDimMapping=*/lhsCopyMapping.threadMapping, + /*foldIfBranch=*/!strategy.alignedLhs()); + } + + Value rhsCopyOpH = rhsH; + if (strategy.hasRhsCopy()) { + MappingInfo rhsCopyMapping = strategy.rhsCopyMapping(); + rhsCopyOpH = buildDistributeOnePadOrCopyWithNumThreads( + b, variantH, rhsH, /*numThreads=*/rhsCopyMapping.numThreads, + /*threadDimMapping=*/rhsCopyMapping.threadMapping, + /*foldIfBranch=*/!strategy.alignedRhs()); + } if (!strategy.alignedRes()) { MappingInfo 
resCopyMapping = strategy.resCopyMapping(); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp new file mode 100644 index 000000000000..eab23f959e76 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp @@ -0,0 +1,237 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h" + +#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" +#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" +#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" + +using namespace mlir; + +#define DEBUG_TYPE "iree-transform-builder" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +// TODO: significantly better namespacing. +using iree_compiler::buildPad; +using iree_compiler::buildSelectFirstNonEmpty; +using iree_compiler::buildTileFuseDistToForallWithNumThreads; +using iree_compiler::buildTileFuseDistToForallWithTileSizes; +using iree_compiler::TileToForallAndFuseAndDistributeResult; +using iree_compiler::gpu::buildBufferize; +using iree_compiler::gpu::buildConvertToAsyncCopies; +using iree_compiler::gpu::buildConvertToTensorCoreOp; +using iree_compiler::gpu::buildDistributeMatmulCopies; +using iree_compiler::gpu::buildHoistOutputPaddingOp; +using iree_compiler::gpu::DataTiledConvolutionStrategy; +using iree_compiler::gpu::MappingInfo; +using iree_compiler::gpu::scaleUpByBitWidth; +using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; +using iree_compiler::IREE::transform_dialect:: + IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp; +using transform::FuseIntoContainingOp; +using transform::MatchOp; +using transform_ext::RegisterMatchCallbacksOp; + +void DataTiledConvolutionStrategy::initDefaultValues(const GPUModel &gpuModel) { + // Set the configuration for padding the matmul. + paddingValueTypes = {captures.inputElementType, captures.filterElementType, + captures.outputElementType}; + paddingDimensions = {0, 1, 2}; + packingDimensions = {1, 0, 1}; + + // Pull in tile configs from flags. 
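+  // When no CLI flags are specified, the fallback below collapses the N block
+  // tile to 1, halves the M block tile until it evenly divides m(), and
+  // selects WMMA as the compute instruction.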
+ AbstractGemmLikeStrategy::initDefaultValues(gpuModel); + if (!cliOptionsSpecified) { + blockTileSizes[1] = 1; + while (m() % blockTileSizes[0]) { + blockTileSizes[0] /= 2; + } + useWmma = true; + } +} + +LLVM_DUMP_METHOD void DataTiledConvolutionStrategy::dump() const { + print(llvm::errs()); +} + +void DataTiledConvolutionStrategy::print(llvm::raw_ostream &os) const { + os << "\n--- Data Tiled Convolution strategy ---\n"; + AbstractGemmLikeStrategy::print(os); +} + +// TODO: implement validator. +LogicalResult +DataTiledConvolutionStrategy::validate(const GPUModel &gpuModel) const { + return success(); +} + +static std::tuple +buildDataTiledConvolutionStrategyBlockDistribution( + ImplicitLocOpBuilder &b, Value variantH, + const DataTiledConvolutionStrategy &strategy) { + // Step 1. Call the matcher. Note that this is the same matcher as used to + // trigger this compilation path, so it must always apply. + b.create(); + auto [padH, fillH, convH, maybeTrailingH] = unpackRegisteredMatchCallback<4>( + b, "convolution", transform::FailurePropagationMode::Propagate, variantH); + + // Step 2. Create the block/mapping tiling level and fusee. + auto [fusionTargetH, fusionGroupH] = + buildSelectFirstNonEmpty(b, maybeTrailingH, convH); + MappingInfo blockMapping = strategy.getBlockMapping(); + TileToForallAndFuseAndDistributeResult tileResult = + buildTileFuseDistToForallWithTileSizes( + /*builder=*/b, + /*variantH=*/variantH, + /*rootH=*/convH, + /*opsToFuseH=*/fusionGroupH, + /*tileSizes=*/ + getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), + /*threadDimMapping=*/ + b.getArrayAttr(blockMapping.threadMapping)); + + auto [blockConvH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( + b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); + + Value fusedPadH = + b.create(padH, tileResult.forallH).getFusedOp(); + Value fusedFillH = + b.create(fillH, tileResult.forallH).getFusedOp(); + + // Handle the workgroup count region. + b.create( + tileResult.forallH); + + return std::make_tuple(fusedPadH, fusedFillH, blockConvH, maybeBlockTrailingH, + tileResult.forallH); +} + +/// Builds the common part of the schedule for matmuls and batched matmuls. +static void buildCommonConvolutionLikeThreadSchedule( + ImplicitLocOpBuilder &b, Value variantH, Value padH, Value fillH, + Value convH, Value trailingH, + const DataTiledConvolutionStrategy &strategy) { + using mlir::iree_compiler::buildLowerVectorMasksAndCleanup; + using mlir::iree_compiler::buildTileFuseToScfFor; + using namespace mlir::iree_compiler::gpu; + + // Tile the outer input channel dimension. + if (strategy.captures.convolutionDims.inputChannel.size() > 1) { + SmallVector tileSizes( + strategy.captures.convolutionDims.outputChannel.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.outputImage.size(), 0); + // tileSizes.append(strategy.captures.convolutionDims.filterLoop.size(), 0); + tileSizes.push_back(1); + + // Avoid canonicalizing before the pad to avoid folding away the + // extract_slice on the output needed to hoist the output pad. + auto tileReductionResult = buildTileFuseToScfFor( + b, variantH, convH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), + /*canonicalize=*/false); + convH = tileReductionResult.tiledOpH; + } + + // Step 2. Pad the (batch) matmul op. + auto paddedConvOpH = buildPad( + b, convH, strategy.getZeroPadAttrFromElementalTypes(b).getValue(), + strategy.paddingDimensions, strategy.packingDimensions); + + // Step 3. 
Hoist the padding of the output operand above the reduction loop. + // The resulting fillOp will be mapped with the contraction using an SIMD + // programming model. + Value fillOpH = fillH; + if (!strategy.alignedRes()) { + fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedConvOpH); + } + + // Running canonicalization is required here to enable aligned pads to become + // linalg.copy ops when rewriting in DPS. + Value funcH = + b.create(variantH, func::FuncOp::getOperationName()); + iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + + // Step 4. Distribute pad and copies: SIMT programming model. + // auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = + buildDistributeMatmulCopies(b, variantH, paddedConvOpH, strategy); + + // Step 5. Tile the filter loop dimensions. + SmallVector tileSizes( + strategy.captures.convolutionDims.outputChannel.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.outputImage.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.filterLoop.size(), 1); + + auto tileReductionResult = + buildTileFuseToScfFor(b, variantH, paddedConvOpH, {}, + getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), + /*canonicalize=*/true); + Value filterTiledConvH = tileReductionResult.tiledOpH; + + // Step 6. Distribute to warps: SIMD programming model. + // TODO: get the number of warps from strategy. + MappingInfo computeMapping = strategy.computeMapping(); + buildTileFuseDistToForallWithNumThreads( + b, variantH, filterTiledConvH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + buildTileFuseDistToForallWithNumThreads( + b, variantH, fillOpH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + + // Step 7. Apply vectorization + cleanups to what remains. + funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true); + + // Step 8. Bufferize and drop HAL descriptor from memref ops. + variantH = buildBufferize(b, variantH); + + // Step 9. Post-bufferization mapping to blocks and threads. + // Need to match again since bufferize invalidated all handles. + // TODO: assumes a single func::FuncOp to transform, needs hardening. + funcH = b.create(variantH, func::FuncOp::getOperationName()); + funcH = + buildMapToBlockAndThreads(b, funcH, + /*blockSize=*/strategy.numThreads, + /*warpDims=*/strategy.numWarps, + /*subgroupSize=*/strategy.targetSubgroupSize); + funcH = b.create(funcH); + + // Step 10. Convert to tensor core ops. + // TODO: avoid consuming handles and returning here. + funcH = buildConvertToTensorCoreOp(b, funcH, strategy); + + // Step 11. Late lowerings and cleanups. + buildLowerVectorMasksAndCleanup(b, funcH); +} + +void iree_compiler::gpu::buildConvolutionTensorCoreStrategy( + ImplicitLocOpBuilder &b, Value variantH, + const DataTiledConvolutionStrategy &strategy) { + LLVM_DEBUG(strategy.print(DBGS())); + + // Step 1. Apply block-level part of the strategy, keeps everything fused. 
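+  // The block-level distribution keeps the pad, fill, and trailing ops fused;
+  // the thread-level schedule that follows tiles the outer input-channel
+  // dimension, promotes the padded input tile to shared memory, tiles the
+  // filter loops, and maps the compute to warps for WMMA.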
+ auto [padH, fillH, convH, trailingH, forall] = + buildDataTiledConvolutionStrategyBlockDistribution(b, variantH, strategy); + buildCommonConvolutionLikeThreadSchedule(b, variantH, padH, fillH, convH, + trailingH, strategy); +} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h new file mode 100644 index 000000000000..4952904b2583 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h @@ -0,0 +1,208 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_CONVOLUTION_STRATEGY_H_ +#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_CONVOLUTION_STRATEGY_H_ + +#include "iree-dialects/Transforms/TransformMatchers.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/MathExtras.h" + +namespace llvm { +class raw_ostream; +} + +namespace mlir { +namespace iree_compiler { +namespace gpu { + +struct GPUModel; + +class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { +public: + DataTiledConvolutionStrategy( + MLIRContext *context, + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) + : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { + initDefaultValues(gpuModel); + } + + DataTiledConvolutionStrategy(const DataTiledConvolutionStrategy &) = default; + DataTiledConvolutionStrategy & + operator=(const DataTiledConvolutionStrategy &) = default; + + /// Constructor quantities. + MLIRContext *ctx; + transform_ext::MatchedConvolutionCaptures captures; + + /// Initialize values from the CLI. Set cliOptionsSpecified to true if the + /// default CLI values have been overriden. 
+  void initDefaultValues(const GPUModel &gpuModel) override;
+
+  LogicalResult validate(const GPUModel &gpuModel) const override;
+
+  int64_t m() const override {
+    int64_t imgElements = 1;
+    for (auto i : captures.convolutionDims.outputImage) {
+      imgElements *= captures.convolutionOpSizes[i];
+    }
+    return imgElements;
+  }
+  int64_t n() const override {
+    int64_t ocElements = 1;
+    for (auto i : captures.convolutionDims.outputChannel) {
+      ocElements *= captures.convolutionOpSizes[i];
+    }
+    return ocElements;
+  }
+  int64_t k() const override {
+    int64_t icElements = 1;
+    for (auto i : captures.convolutionDims.inputChannel) {
+      icElements *= captures.convolutionOpSizes[i];
+    }
+    return icElements;
+  }
+
+  int64_t blockTileM() const override {
+    assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes");
+    return blockTileSizes[0];
+  }
+  int64_t blockTileN() const override {
+    assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes");
+    return blockTileSizes[1];
+  }
+
+  int64_t numWarpsX() const override {
+    assert(numWarps.size() >= 2 && "need at least 2 warp sizes");
+    return numWarps[0];
+  }
+  int64_t numWarpsY() const override {
+    assert(numWarps.size() >= 2 && "need at least 2 warp sizes");
+    return numWarps[1];
+  }
+
+  Type getLhsElementalType() const override {
+    return captures.inputElementType;
+  }
+  Type getRhsElementalType() const override {
+    return captures.filterElementType;
+  }
+  Type getResElementalType() const override {
+    return captures.outputElementType;
+  }
+
+  virtual bool alignedLhs() const { return true; }
+  virtual bool alignedRhs() const { return true; }
+  virtual bool alignedRes() const { return true; }
+
+  bool hasLhsCopy() const override { return true; }
+  // Filter is not copied.
+  bool hasRhsCopy() const override { return false; }
+
+  MappingInfo getBlockMapping() const override {
+    SmallVector<int64_t> tileSizes;
+    SmallVector<Attribute> threadMapping = {blockY(ctx), blockX(ctx)};
+    // Outer output channel.
+    if (captures.convolutionDims.outputChannel.size() == 2) {
+      tileSizes.push_back(blockTileN());
+      threadMapping = {blockZ(ctx), blockY(ctx), blockX(ctx)};
+    }
+    // Image height.
+    tileSizes.push_back(1);
+    // Image width.
+    tileSizes.push_back(blockTileM());
+    return MappingInfo{/*numThreads=*/{},
+                       /*tileSizes=*/tileSizes,
+                       /*threadMapping=*/threadMapping,
+                       /*vectorSize=*/std::nullopt};
+  }
+
+  MappingInfo lhsCopyMapping() const override {
+    int64_t inputTileH =
+        captures.convolutionOpSizes[captures.convolutionDims.filterLoop[0]];
+    int64_t inputTileW =
+        captures.convolutionOpSizes[captures.convolutionDims.filterLoop[1]] +
+        blockTileM() - 1;
+    int64_t icInnerTileSize =
+        captures
+            .convolutionOpSizes[captures.convolutionDims.inputChannel.back()];
+    MappingInfo mapping = CopyMapping::getMappingInfo(
+        ctx, totalNumThreads(),
+        /*alignment=*/k(),
+        /*copySizes=*/
+        ArrayRef<int64_t>{inputTileH, inputTileW, icInnerTileSize},
+        /*favorPredication=*/false,
+        /*elementalBitWidth=*/lhsElementalBitWidth());
+    if (captures.convolutionDims.inputChannel.size() == 2) {
+      mapping.tileSizes.insert(mapping.tileSizes.begin(), 1);
+      mapping.numThreads.insert(mapping.numThreads.begin(), 0);
+    }
+    return mapping;
+  }
+  // TODO: Write a validator.
+  LogicalResult validateLhsCopyMapping() const override { return success(); }
+
+  // Filter is not copied.
+ MappingInfo rhsCopyMapping() const override { return MappingInfo(); } + LogicalResult validateRhsCopyMapping() const override { return success(); } + + MappingInfo resCopyMapping() const override { + int64_t outputTileH = 1; + int64_t outputTileW = blockTileM(); + int64_t ocInnerTileSize = + captures + .convolutionOpSizes[captures.convolutionDims.outputChannel.back()]; + MappingInfo mapping = CopyMapping::getMappingInfo( + ctx, totalNumThreads(), + /*alignment=*/n(), + /*copySizes=*/ArrayRef{blockTileM(), blockTileN()}, + /*favorPredication=*/false, + /*elementalBitWidth=*/resElementalBitWidth()); + if (captures.convolutionDims.inputChannel.size() == 2) { + mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); + mapping.numThreads.insert(mapping.numThreads.begin(), 0); + } + return mapping; + } + // TODO: Write a validator. + LogicalResult validateResCopyMapping() const override { return success(); } + + // COMPUTE is of size mxn. + MappingInfo computeMapping() const override { + // FMA disabled. + // if (useFma) { + // // When using FMA we don't need to map to warps, instead just match + // what + // // the copy does. + // return CopyMapping::getMappingInfo(ctx, totalNumThreads(), + // /*alignment=*/n(), + // {blockTileM(), blockTileN()}); + // } + return MappingInfo{ + /*numThreads=*/captures.convolutionDims.outputChannel.size() == 2 + ? SmallVector{0, 0, numWarpsY(), numWarpsX()} + : SmallVector{0, numWarpsY(), numWarpsX()}, + /*tileSizes=*/{}, + /*threadMapping=*/{warpY(ctx), warpX(ctx)}, + /*vectorSize=*/std::nullopt}; + } + + void print(llvm::raw_ostream &os) const override; + LLVM_DUMP_METHOD void dump() const override; +}; + +} // namespace gpu +} // namespace iree_compiler +} // namespace mlir + +#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_TENSOR_CORE_CONVOLUTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp index ea12b1fd3472..172d2aca48ce 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp @@ -15,6 +15,7 @@ #include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h" @@ -55,6 +56,11 @@ llvm::cl::opt clGPUEnableTransformDialectImplicitGemmStrategy( "iree-codegen-llvmgpu-enable-transform-dialect-implicit-gemm-strategy", llvm::cl::desc("activate the convolution implicit gemm strategy"), llvm::cl::init(false)); +llvm::cl::opt clGPUEnableTransformDialectConvolutionTensorCoreStrategy( + "iree-codegen-llvmgpu-enable-transform-dialect-convolution-tensorcore-" + "strategy", + llvm::cl::desc("activate the convolution tensorcore strategy"), + llvm::cl::init(true)); llvm::cl::opt clGPUEnableTransformDialectAlignedMatmul( "iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul", llvm::cl::desc( @@ -77,6 +83,7 @@ llvm::cl::opt clGPUEnableTransformDialectBatchMatmulStrategy( // TODO: significantly better namespacing. 
using iree_compiler::gpu::AbstractGemmLikeStrategy; using iree_compiler::gpu::BatchMatmulStrategy; +using iree_compiler::gpu::DataTiledConvolutionStrategy; using iree_compiler::gpu::GPUModel; using iree_compiler::gpu::ImplicitGemmStrategy; using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth; @@ -546,7 +553,7 @@ static LogicalResult matchAndSetMatmulStrategy(func::FuncOp entryPoint, /// precedence over other heuristics. In the future, this could be lifted to /// e.g. `gpuModel` or higher up in some transform dialect database summary of /// "known good things". -static FailureOr applyKnownGoodConvolutionConfigurations( +static FailureOr applyKnownGoodImplicitGemmConfigurations( const transform_ext::MatchedConvolutionCaptures &captures, const GPUModel &gpuModel) { return failure(); @@ -585,15 +592,15 @@ static void failSafeOverrides(ImplicitGemmStrategy &strategy, /// The configurations below have been determined empirically. // TODO: Significantly improve these heuristics. static ImplicitGemmStrategy -getConvolutionConfig(MLIRContext *context, - const transform_ext::MatchedConvolutionCaptures &captures, - const GPUModel &gpuModel) { +getImplicitGemmConfig(MLIRContext *context, + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) { ImplicitGemmStrategy strategy(context, captures, gpuModel); if (strategy.cliOptionsSpecified) return strategy; auto maybeHardcodedConfiguration = - applyKnownGoodConvolutionConfigurations(captures, gpuModel); + applyKnownGoodImplicitGemmConfigurations(captures, gpuModel); if (succeeded(maybeHardcodedConfiguration)) return *maybeHardcodedConfiguration; @@ -675,7 +682,7 @@ static LogicalResult matchAndSetConvolutionStrategy(func::FuncOp entryPoint, } iree_compiler::gpu::ImplicitGemmStrategy strategy = - getConvolutionConfig(op->getContext(), captures, gpuModel); + getImplicitGemmConfig(op->getContext(), captures, gpuModel); // Validate the strategy configuration against the compilation target. if (failed(strategy.validate(gpuModel))) { @@ -695,6 +702,110 @@ static LogicalResult matchAndSetConvolutionStrategy(func::FuncOp entryPoint, return success(); } +static FailureOr +applyKnownGoodConvolutionConfigurations( + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) { + return failure(); +} + +static void failSafeOverrides(DataTiledConvolutionStrategy &strategy, + const GPUModel &gpuModel) {} + +/// The configurations below have been determined empirically. +// TODO: Significantly improve these heuristics. +static DataTiledConvolutionStrategy +getConvolutionConfig(MLIRContext *context, + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) { + DataTiledConvolutionStrategy strategy(context, captures, gpuModel); + if (strategy.cliOptionsSpecified) + return strategy; + + auto maybeHardcodedConfiguration = + applyKnownGoodConvolutionConfigurations(captures, gpuModel); + if (succeeded(maybeHardcodedConfiguration)) + return *maybeHardcodedConfiguration; + + // TODO: encode a decision tree of reasonnable heuristics here. + + // Apply failsafe overrides to avoid identified bad corner cases. + failSafeOverrides(strategy, gpuModel); + + return strategy; +} + +static LogicalResult matchAndSetDataTiledConvolutionStrategy( + func::FuncOp entryPoint, linalg::LinalgOp op, const GPUModel &gpuModel) { + if (!clGPUEnableTransformDialectConvolutionTensorCoreStrategy) { + LDBG("--Convolution strategy flag turned off\n"); + return failure(); + } + + // 1. 
Match a reduction and surrounding ops. + CapturingOpMatcher *pad; + StructuredOpMatcher *fill; + StructuredOpMatcher *convolution; + StructuredOpMatcher *trailing; + transform_ext::MatchedConvolutionCaptures captures; + transform_ext::MatcherContext matcherContext; + makeConvolutionMatcher(matcherContext, convolution, pad, fill, trailing, + captures, + /*mustMatchEntireFunc=*/true); + if (!matchPattern(op, *convolution)) { + LDBG("--Convolution strategy fail to match\n"); + return failure(); + } + + if (!fill->getCaptured() || pad->getCaptured()) { + LDBG("--Convolution strategy capture preconditions failed\n"); + return failure(); + } + + if (captures.convolutionDims.outputImage.size() != 2) { + return failure(); + } + if (captures.convolutionDims.filterLoop.size() != 2) { + return failure(); + } + if (captures.convolutionDims.batch.size() != 0) { + return failure(); + } + + // int64_t channelSize = 1; + // for (auto dim : captures.convolutionDims.outputChannel) + // channelSize *= captures.convolutionOpSizes[dim]; + // int64_t imageSize = 1; + // for (auto dim : captures.convolutionDims.outputImage) + // imageSize *= captures.convolutionOpSizes[dim]; + + // int64_t derivedK = 1; + // for (auto dim : captures.convolutionDims.filterLoop) + // derivedK *= captures.convolutionOpSizes[dim]; + // for (auto dim : captures.convolutionDims.inputChannel) + // derivedK *= captures.convolutionOpSizes[dim]; + + iree_compiler::gpu::DataTiledConvolutionStrategy strategy = + getConvolutionConfig(op->getContext(), captures, gpuModel); + + // Validate the strategy configuration against the compilation target. + if (failed(strategy.validate(gpuModel))) { + LDBG("--Convolution strategy failed to validate\n"); + return failure(); + } + + // 2. Construct the configuration and the strategy builder. + // TODO: Generalize along the HW axis. + auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { + return buildConvolutionTensorCoreStrategy(b, variant, strategy); + }; + + // 3. Build strategy embedded into the IR. + mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); + + return success(); +} + //===--------------------------------------------------------------------===// // Pad strategies. //===--------------------------------------------------------------------===// @@ -809,6 +920,11 @@ LogicalResult mlir::iree_compiler::gpu::matchAndSetTransformStrategy( LDBG("Activate batch matmul\n"); return success(); } + if (succeeded(matchAndSetDataTiledConvolutionStrategy(entryPoint, linalgOp, + gpuModel))) { + LDBG("Activate convolution\n"); + return success(); + } if (succeeded( matchAndSetConvolutionStrategy(entryPoint, linalgOp, gpuModel))) { LDBG("Activate convolution\n"); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h index 5f7e051c91ad..8b6e3960b0f4 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h @@ -17,8 +17,9 @@ namespace iree_compiler { namespace gpu { /// Forward declarations of all supported strategies. 
-struct BatchMatmulStrategy; -struct MatmulStrategy; +class BatchMatmulStrategy; +class MatmulStrategy; +class DataTiledConvolutionStrategy; class PadStrategy; class SmallReductionStrategy; class StagedReductionStrategy; @@ -88,6 +89,13 @@ void buildMatmulTensorCoreStrategy(ImplicitLocOpBuilder &b, Value variantH, void buildBatchMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH, const BatchMatmulStrategy &strategy); +//===--------------------------------------------------------------------===// +// Convolution strategies. +//===--------------------------------------------------------------------===// +void buildConvolutionTensorCoreStrategy( + ImplicitLocOpBuilder &b, Value variantH, + const DataTiledConvolutionStrategy &strategy); + //===--------------------------------------------------------------------===// // Pad strategies. //===--------------------------------------------------------------------===// From 5397313e65993c3bab11241d67c704c710dbbd4b Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Mon, 10 Jul 2023 04:07:07 -0400 Subject: [PATCH 06/12] [TransformStrategies] Add direct fma conv without shared memory Adds a direct SIMT(/fma/dot4) conv approach without shared memory. --- .../TransformStrategies/GPU/BUILD.bazel | 2 + .../TransformStrategies/GPU/CMakeLists.txt | 2 + .../GPU/ConvolutionStrategy.cpp | 322 ++++++++++++++++++ .../GPU/ConvolutionStrategy.h | 142 ++++++++ .../TransformStrategies/GPU/Strategies.cpp | 88 ++++- .../TransformStrategies/GPU/Strategies.h | 3 + 6 files changed, 553 insertions(+), 6 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel index d2e3bba16743..8c9731678085 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel @@ -18,6 +18,7 @@ iree_compiler_cc_library( "AbstractGemmLikeStrategy.cpp", "Common.cpp", "ConvolutionImplicitGemmStrategy.cpp", + "ConvolutionStrategy.cpp", "ConvolutionTensorCoreStrategy.cpp", "CopyMapping.cpp", "MappingInfo.cpp", @@ -31,6 +32,7 @@ iree_compiler_cc_library( "AbstractGemmLikeStrategy.h", "Common.h", "ConvolutionImplicitGemmStrategy.h", + "ConvolutionStrategy.h", "ConvolutionTensorCoreStrategy.h", "CopyMapping.h", "MappingInfo.h", diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt index 3ae45c54c59f..d00bd8e2bfd1 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt @@ -17,6 +17,7 @@ iree_cc_library( "AbstractGemmLikeStrategy.h" "Common.h" "ConvolutionImplicitGemmStrategy.h" + "ConvolutionStrategy.h" "ConvolutionTensorCoreStrategy.h" "CopyMapping.h" "MappingInfo.h" @@ -29,6 +30,7 @@ iree_cc_library( "AbstractGemmLikeStrategy.cpp" "Common.cpp" "ConvolutionImplicitGemmStrategy.cpp" + "ConvolutionStrategy.cpp" "ConvolutionTensorCoreStrategy.cpp" "CopyMapping.cpp" "MappingInfo.cpp" diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp new file mode 100644 
index 000000000000..3dc899d555a4 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp @@ -0,0 +1,322 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h" + +#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" +#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" +#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/MemRef/TransformOps/MemRefTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" +#include "mlir/Support/MathExtras.h" + +using namespace mlir; + +#define DEBUG_TYPE "iree-transform-builder" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +// TODO: significantly better namespacing. 
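// Illustrative sketch (not from the patch): a standalone rendering of the
// default-size derivation implemented in ConvolutionStrategy::initDefaultValues
// below. The concrete numbers are hypothetical: with an output-image height of
// 56 the default block tile {4, 16, 1} is kept since 56 % 4 == 0, while a
// height of 30 halves it to {2, 16, 1}; with 64 threads and a subgroup size of
// 32, the derived warp count along x is ceilDiv(64, 32) == 2.
[[maybe_unused]] static int64_t pickDefaultBlockTileH(int64_t outputImageH) {
  int64_t tileH = 4;
  // Mirrors the halving loop in initDefaultValues.
  while (outputImageH % tileH)
    tileH /= 2;
  return tileH;
}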
+using iree_compiler::buildPad; +using iree_compiler::buildSelectFirstNonEmpty; +using iree_compiler::buildTileFuseDistToForallWithNumThreads; +using iree_compiler::buildTileFuseDistToForallWithTileSizes; +using iree_compiler::TileToForallAndFuseAndDistributeResult; +using iree_compiler::gpu::buildBufferize; +using iree_compiler::gpu::buildConvertToAsyncCopies; +using iree_compiler::gpu::buildConvertToTensorCoreOp; +using iree_compiler::gpu::buildDistributeMatmulCopies; +using iree_compiler::gpu::ConvolutionStrategy; +using iree_compiler::gpu::MappingInfo; +using iree_compiler::IREE::transform_dialect::ApplyBufferOptimizationsOp; +using iree_compiler::IREE::transform_dialect:: + ApplyFoldReshapeIntoTensorHalInterfacePatternsOp; +using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; +using iree_compiler::IREE::transform_dialect:: + IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp; +using transform::FuseIntoContainingOp; +using transform::MatchOp; +using transform_ext::RegisterMatchCallbacksOp; + +static llvm::cl::list clBlockTileSizes( + "td-convolution-strategy-blk-sizes", + llvm::cl::desc("block tile size for dims (x,y,z) for the transform " + "dialect convolution strategy"), + llvm::cl::CommaSeparated); +static llvm::cl::list clNumThreads( + "td-convolution-strategy-num-threads", + llvm::cl::desc("number of threads for dims (x,y,z) for the transform " + "dialect convolution strategy"), + llvm::cl::CommaSeparated); +static llvm::cl::list clNumWarps( + "td-convolution-strategy-num-warps", + llvm::cl::desc("number of warps for dims (x,y,z) for the transform " + "dialect convolution strategy"), + llvm::cl::CommaSeparated); + +void ConvolutionStrategy::initDefaultValues(const GPUModel &gpuModel) { + blockTileSizes = + SmallVector{clBlockTileSizes.begin(), clBlockTileSizes.end()}; + numThreads = SmallVector{clNumThreads.begin(), clNumThreads.end()}; + numWarps = SmallVector{clNumWarps.begin(), clNumWarps.end()}; + + /// Default configuration based on hardware properties and problem bit widths. + if (clBlockTileSizes.getNumOccurrences()) { + blockTileSizes = + SmallVector(clBlockTileSizes.begin(), clBlockTileSizes.end()); + } else { + blockTileSizes = SmallVector{4, 16, 1}; + while ( + captures + .convolutionOpSizes[captures.convolutionDims.outputImage.front()] % + blockTileSizes[0]) + blockTileSizes[0] /= 2; + } + + if (clNumThreads.getNumOccurrences()) { + numThreads = SmallVector(clNumThreads.begin(), clNumThreads.end()); + } else { + // Infer from warp counts if present. 
+    if (clNumWarps.getNumOccurrences()) {
+      numThreads = SmallVector<int64_t>(clNumWarps.begin(), clNumWarps.end());
+      numThreads[0] *= subgroupSize;
+    } else {
+      numThreads = SmallVector<int64_t>{64, 1, 1};
+    }
+  }
+  if (clNumWarps.getNumOccurrences()) {
+    numWarps = SmallVector<int64_t>(clNumWarps.begin(), clNumWarps.end());
+  } else {
+    numWarps = numThreads;
+    numWarps[0] = mlir::ceilDiv(numWarps[0], subgroupSize);
+  }
+}
+
+LLVM_DUMP_METHOD void ConvolutionStrategy::dump() const { print(llvm::errs()); }
+
+void ConvolutionStrategy::print(llvm::raw_ostream &os) const {
+  os << "\n--- Convolution strategy ---\n";
+  os << "- block tile sizes: {";
+  bool isFirst = true;
+  for (int64_t blockTileSize : blockTileSizes) {
+    if (!isFirst)
+      os << ", ";
+    os << blockTileSize;
+    isFirst = false;
+  }
+  os << "}\n";
+  os << "- number of threads: {";
+  isFirst = true;
+  for (int64_t numThreadsForDim : numThreads) {
+    if (!isFirst)
+      os << ", ";
+    os << numThreadsForDim;
+    isFirst = false;
+  }
+  os << "}\n";
+
+  os << "- number of warps: {";
+  isFirst = true;
+  for (int64_t numWarpsForDim : numWarps) {
+    if (!isFirst)
+      os << ", ";
+    os << numWarpsForDim;
+    isFirst = false;
+  }
+  os << "}\n";
+  os << "\n-- Derived quantities --\n";
+  os << "- block mapping:\n";
+  getBlockMapping().print(os << " -> ");
+  os << "- compute mapping:\n";
+  computeMapping().print(os << " -> ");
+}
+
+// TODO: implement validator.
+LogicalResult ConvolutionStrategy::validate(const GPUModel &gpuModel) const {
+  return success();
+}
+
+static std::tuple<Value, Value, Value, Value, Value>
+buildConvolutionStrategyBlockDistribution(ImplicitLocOpBuilder &b,
+                                          Value variantH,
+                                          const ConvolutionStrategy &strategy) {
+  // Step 1. Call the matcher. Note that this is the same matcher as used to
+  // trigger this compilation path, so it must always apply.
+  b.create<RegisterMatchCallbacksOp>();
+  auto [padH, fillH, convH, maybeTrailingH] = unpackRegisteredMatchCallback<4>(
+      b, "convolution", transform::FailurePropagationMode::Propagate, variantH);
+
+  // Step 2. Create the block/mapping tiling level and fusee.
+  auto [fusionTargetH, fusionGroupH] =
+      buildSelectFirstNonEmpty(b, maybeTrailingH, convH);
+  MappingInfo blockMapping = strategy.getBlockMapping();
+  TileToForallAndFuseAndDistributeResult tileResult =
+      buildTileFuseDistToForallWithTileSizes(
+          /*builder=*/b,
+          /*variantH=*/variantH,
+          /*rootH=*/fusionTargetH,
+          /*opsToFuseH=*/fusionGroupH,
+          /*tileSizes=*/
+          getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)),
+          /*threadDimMapping=*/
+          b.getArrayAttr(blockMapping.threadMapping));
+
+  auto [blockConvH, maybeBlockTrailingH] = buildSelectFirstNonEmpty(
+      b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH);
+
+  Value fusedPadH =
+      b.create<FuseIntoContainingOp>(padH, tileResult.forallH).getFusedOp();
+  Value fusedFillH =
+      b.create<FuseIntoContainingOp>(fillH, tileResult.forallH).getFusedOp();
+
+  // Handle the workgroup count region.
+  b.create<IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp>(
+      tileResult.forallH);
+
+  return std::make_tuple(fusedPadH, fusedFillH, blockConvH, maybeBlockTrailingH,
+                         tileResult.forallH);
+}
+
+/// Builds the common part of the thread-level schedule for convolutions.
+static void buildCommonConvolutionLikeThreadSchedule(
+    ImplicitLocOpBuilder &b, Value variantH, Value padH, Value fillH,
+    Value convH, Value trailingH, const ConvolutionStrategy &strategy) {
+  using mlir::iree_compiler::buildLowerVectorMasksAndCleanup;
+  using mlir::iree_compiler::buildTileFuseToScfFor;
+  using namespace mlir::iree_compiler::gpu;
+
+  // Tile the outer input channel dimension.
+ if (strategy.captures.convolutionDims.inputChannel.size() > 1) { + SmallVector tileSizes( + strategy.captures.convolutionDims.outputChannel.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.outputImage.size(), 0); + // tileSizes.append(strategy.captures.convolutionDims.filterLoop.size(), 0); + tileSizes.push_back(1); + + // Avoid canonicalizing before the pad to avoid folding away the + // extract_slice on the output needed to hoist the output pad. + auto tileReductionResult = buildTileFuseToScfFor( + b, variantH, convH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), + /*canonicalize=*/false); + convH = tileReductionResult.tiledOpH; + } + + Value funcH = + b.create(variantH, func::FuncOp::getOperationName()); + iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + + // Step 5. Tile the filter loop dimensions. + SmallVector tileSizes( + strategy.captures.convolutionDims.outputChannel.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.outputImage.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.filterLoop.size(), 1); + + auto tileReductionResult = buildTileFuseToScfFor( + b, variantH, convH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), + /*canonicalize=*/true); + Value filterTiledConvH = tileReductionResult.tiledOpH; + + // Step 6. Distribute to threads: SIMT programming model. + MappingInfo computeMapping = strategy.computeMapping(); + buildTileFuseDistToForallWithNumThreads( + b, variantH, filterTiledConvH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + buildTileFuseDistToForallWithNumThreads( + b, variantH, fillH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + buildTileFuseDistToForallWithNumThreads( + b, variantH, trailingH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + + // Step 7. Apply vectorization + cleanups to what remains. + b.create(funcH, [](OpBuilder &b, Location loc) { + b.create(loc); + b.create(loc); + b.create(loc); + }); + funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true); + + // Step 8. Bufferize and drop HAL descriptor from memref ops. + variantH = buildBufferize(b, variantH); + + // Step 9. Post-bufferization mapping to blocks and threads. + // Need to match again since bufferize invalidated all handles. + // TODO: assumes a single func::FuncOp to transform, needs hardening. + funcH = b.create(variantH, func::FuncOp::getOperationName()); + funcH = buildMapToBlockAndThreads(b, funcH, + /*blockSize=*/strategy.numThreads, + /*warpDims=*/strategy.numWarps, + /*subgroupSize=*/strategy.subgroupSize); + // This currently spins forever. + // funcH = b.create(funcH); + + // Step 10. Cleanup. 
+ iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + b.create(funcH); + b.create(funcH, [](OpBuilder &b, Location loc) { + b.create(loc); + }); + b.create(funcH, [](OpBuilder &b, Location loc) { + b.create(loc); + }); + iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + + // Value forH = b.create( + // transform::OperationType::get(b.getContext(), "scf.for"), funcH, + // b.getStrArrayAttr({scf::ForOp::getOperationName()}), + // /*matchInterfaceEnum=*/transform::MatchInterfaceEnumAttr(), + // /*opAttrs=*/DictionaryAttr(), + // /*filterResultType=*/TypeAttr()); + // // TODO: At this time, this synchronization is needed for applying the + // // HoistRedundantVectorTransfersOp transform correctly. This is because the + // // transform does not take parallelism into accound. + // // In the future, HoistRedundantVectorTransfersOp + SynchronizeLoopOp need + // to + // // be replaced by a single transform. + // b.create(forH); + + // TODO: not a functional style transform and avoid returning funcH. + // funcH = b.create( + // transform::AnyOpType::get(b.getContext()), funcH); + iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + b.create(funcH); + + // // Post-hoc elimiation of barriers. + // funcH = b.create(funcH); + + // Step 11. Late lowerings and cleanups. + buildLowerVectorMasksAndCleanup(b, funcH); +} + +void iree_compiler::gpu::buildConvolutionStrategy( + ImplicitLocOpBuilder &b, Value variantH, + const ConvolutionStrategy &strategy) { + LLVM_DEBUG(strategy.print(DBGS())); + + // Step 1. Apply block-level part of the strategy, keeps everything fused. + auto [padH, fillH, convH, trailingH, forall] = + buildConvolutionStrategyBlockDistribution(b, variantH, strategy); + buildCommonConvolutionLikeThreadSchedule(b, variantH, padH, fillH, convH, + trailingH, strategy); +} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h new file mode 100644 index 000000000000..08b3c9999b0e --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h @@ -0,0 +1,142 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_STRATEGY_H_ +#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_STRATEGY_H_ + +#include "iree-dialects/Transforms/TransformMatchers.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Support/LogicalResult.h" + +namespace llvm { +class raw_ostream; +} + +namespace mlir { +namespace iree_compiler { +namespace gpu { + +struct GPUModel; + +class ConvolutionStrategy : public GPUStrategy { +public: + ConvolutionStrategy(MLIRContext *context, + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) + : GPUStrategy(gpuModel), ctx(context), captures(captures) { + initDefaultValues(gpuModel); + } + + ConvolutionStrategy(const ConvolutionStrategy &) = default; + ConvolutionStrategy &operator=(const ConvolutionStrategy &) = default; + + /// Constructor quantities. + MLIRContext *ctx; + transform_ext::MatchedConvolutionCaptures captures; + + /// Initialize values from the CLI. + void initDefaultValues(const GPUModel &gpuModel); + + LogicalResult validate(const GPUModel &gpuModel) const; + + //===--------------------------------------------------------------------===// + // Parameters that control the tiling and mapping. + //===--------------------------------------------------------------------===// + + /// Tile sizes for the workgroup / determines grid size for all known + /// reduction strategies. The initial values are set by initDefaultValues(); + SmallVector blockTileSizes; + int64_t reductionTileSize; + SmallVector numThreads; + SmallVector numWarps; + + /// Common values based on derived quantities. + int64_t totalNumThreads() const { + int64_t res = 1; + for (auto v : numThreads) + res *= v; + return res; + } + + int64_t totalNumWarps() const { + int64_t res = 1; + for (auto v : numWarps) + res *= v; + return res; + } + + int64_t blockTileH() const { + assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); + return blockTileSizes[0]; + } + int64_t blockTileW() const { + assert(blockTileSizes.size() >= 2 && "need at least 2 tile sizes"); + return blockTileSizes[1]; + } + + // int64_t numWarpsX() const { + // assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); + // return numWarps[0]; + // } + // int64_t numWarpsY() const { + // assert(numWarps.size() >= 2 && "need at least 2 warp sizes"); + // return numWarps[1]; + // } + + MappingInfo getBlockMapping() const { + SmallVector tileSizes; + SmallVector threadMapping = {blockY(ctx), blockX(ctx)}; + // Outer output channel. + if (captures.convolutionDims.outputChannel.size() == 2) { + tileSizes.push_back(1); + threadMapping = {blockZ(ctx), blockY(ctx), blockX(ctx)}; + } + // Image height. + tileSizes.push_back(blockTileH()); + // Image width. 
+ tileSizes.push_back(blockTileW()); + return MappingInfo{/*numThreads=*/{}, + /*tileSizes=*/tileSizes, + /*threadMapping=*/threadMapping, + /*vectorSize=*/std::nullopt}; + } + + MappingInfo computeMapping() const { + int64_t innerOcTileSize = + captures + .convolutionOpSizes[captures.convolutionDims.outputChannel.back()]; + MappingInfo mapping = CopyMapping::getMappingInfo( + ctx, totalNumThreads(), + /*alignment=*/innerOcTileSize, + {blockTileH(), blockTileW(), innerOcTileSize}); + if (captures.convolutionDims.outputChannel.size() == 2) { + mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); + mapping.numThreads.insert(mapping.numThreads.begin(), 0); + } + return mapping; + // return MappingInfo{ + // /*numThreads=*/captures.convolutionDims.outputChannel.size() == 2 + // ? SmallVector{0, 0, numWarpsY(), numWarpsX()} + // : SmallVector{0, numWarpsY(), numWarpsX()}, + // /*tileSizes=*/{}, + // /*threadMapping=*/{warpY(ctx), warpX(ctx)}, + // /*vectorSize=*/std::nullopt}; + } + + void print(llvm::raw_ostream &os) const; + LLVM_DUMP_METHOD void dump() const; +}; + +} // namespace gpu +} // namespace iree_compiler +} // namespace mlir + +#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_CONVOLUTION_STRATEGY_H_ diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp index 172d2aca48ce..5004b4864e13 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp @@ -15,6 +15,7 @@ #include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h" #include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h" @@ -61,6 +62,9 @@ llvm::cl::opt clGPUEnableTransformDialectConvolutionTensorCoreStrategy( "strategy", llvm::cl::desc("activate the convolution tensorcore strategy"), llvm::cl::init(true)); +llvm::cl::opt clGPUEnableTransformDialectConvolutionStrategy( + "iree-codegen-llvmgpu-enable-transform-dialect-convolution-strategy", + llvm::cl::desc("activate the convolution strategy"), llvm::cl::init(true)); llvm::cl::opt clGPUEnableTransformDialectAlignedMatmul( "iree-codegen-llvmgpu-enable-transform-dialect-aligned-matmul", llvm::cl::desc( @@ -83,6 +87,7 @@ llvm::cl::opt clGPUEnableTransformDialectBatchMatmulStrategy( // TODO: significantly better namespacing. 
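// Illustrative sketch (not from the patch): the early-exit checks in
// matchAndSetDirectConvolutionStrategy below only admit batch-free 2-D
// convolutions. The dimension checks are equivalent to this hypothetical
// predicate:
[[maybe_unused]] static bool
isBatchFree2DConv(const transform_ext::MatchedConvolutionCaptures &captures) {
  return captures.convolutionDims.outputImage.size() == 2 &&
         captures.convolutionDims.filterLoop.size() == 2 &&
         captures.convolutionDims.batch.empty();
}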
using iree_compiler::gpu::AbstractGemmLikeStrategy; using iree_compiler::gpu::BatchMatmulStrategy; +using iree_compiler::gpu::ConvolutionStrategy; using iree_compiler::gpu::DataTiledConvolutionStrategy; using iree_compiler::gpu::GPUModel; using iree_compiler::gpu::ImplicitGemmStrategy; @@ -612,9 +617,8 @@ getImplicitGemmConfig(MLIRContext *context, return strategy; } -static LogicalResult matchAndSetConvolutionStrategy(func::FuncOp entryPoint, - linalg::LinalgOp op, - const GPUModel &gpuModel) { +static LogicalResult matchAndSetConvolutionImplicitGemmStrategy( + func::FuncOp entryPoint, linalg::LinalgOp op, const GPUModel &gpuModel) { if (!clGPUEnableTransformDialectImplicitGemmStrategy) { LDBG("--Implicit gemm strategy flag turned off\n"); return failure(); @@ -715,7 +719,7 @@ static void failSafeOverrides(DataTiledConvolutionStrategy &strategy, /// The configurations below have been determined empirically. // TODO: Significantly improve these heuristics. static DataTiledConvolutionStrategy -getConvolutionConfig(MLIRContext *context, +getDataTiledConvolutionConfig(MLIRContext *context, const transform_ext::MatchedConvolutionCaptures &captures, const GPUModel &gpuModel) { DataTiledConvolutionStrategy strategy(context, captures, gpuModel); @@ -786,7 +790,7 @@ static LogicalResult matchAndSetDataTiledConvolutionStrategy( // derivedK *= captures.convolutionOpSizes[dim]; iree_compiler::gpu::DataTiledConvolutionStrategy strategy = - getConvolutionConfig(op->getContext(), captures, gpuModel); + getDataTiledConvolutionConfig(op->getContext(), captures, gpuModel); // Validate the strategy configuration against the compilation target. if (failed(strategy.validate(gpuModel))) { @@ -806,6 +810,73 @@ static LogicalResult matchAndSetDataTiledConvolutionStrategy( return success(); } +/// The configurations below have been determined empirically. +// TODO: Significantly improve these heuristics. +static ConvolutionStrategy +getDirectConvolutionConfig(MLIRContext *context, + const transform_ext::MatchedConvolutionCaptures &captures, + const GPUModel &gpuModel) { + return ConvolutionStrategy(context, captures, gpuModel); +} + +static LogicalResult matchAndSetDirectConvolutionStrategy( + func::FuncOp entryPoint, linalg::LinalgOp op, const GPUModel &gpuModel) { + if (!clGPUEnableTransformDialectConvolutionStrategy) { + LDBG("--Convolution strategy flag turned off\n"); + return failure(); + } + + // 1. Match a reduction and surrounding ops. + CapturingOpMatcher *pad; + StructuredOpMatcher *fill; + StructuredOpMatcher *convolution; + StructuredOpMatcher *trailing; + transform_ext::MatchedConvolutionCaptures captures; + transform_ext::MatcherContext matcherContext; + makeConvolutionMatcher(matcherContext, convolution, pad, fill, trailing, + captures, + /*mustMatchEntireFunc=*/true); + if (!matchPattern(op, *convolution)) { + LDBG("--Convolution strategy fail to match\n"); + return failure(); + } + + if (!fill->getCaptured() || pad->getCaptured()) { + LDBG("--Convolution strategy capture preconditions failed\n"); + return failure(); + } + + if (captures.convolutionDims.outputImage.size() != 2) { + return failure(); + } + if (captures.convolutionDims.filterLoop.size() != 2) { + return failure(); + } + if (captures.convolutionDims.batch.size() != 0) { + return failure(); + } + + iree_compiler::gpu::ConvolutionStrategy strategy = + getDirectConvolutionConfig(op->getContext(), captures, gpuModel); + + // Validate the strategy configuration against the compilation target. 
+ if (failed(strategy.validate(gpuModel))) { + LDBG("--Convolution strategy failed to validate\n"); + return failure(); + } + + // 2. Construct the configuration and the strategy builder. + // TODO: Generalize along the HW axis. + auto strategyBuilder = [&](ImplicitLocOpBuilder &b, Value variant) { + return buildConvolutionStrategy(b, variant, strategy); + }; + + // 3. Build strategy embedded into the IR. + mlir::iree_compiler::createTransformRegion(entryPoint, strategyBuilder); + + return success(); +} + //===--------------------------------------------------------------------===// // Pad strategies. //===--------------------------------------------------------------------===// @@ -925,8 +996,13 @@ LogicalResult mlir::iree_compiler::gpu::matchAndSetTransformStrategy( LDBG("Activate convolution\n"); return success(); } + if (succeeded(matchAndSetConvolutionImplicitGemmStrategy(entryPoint, linalgOp, + gpuModel))) { + LDBG("Activate convolution\n"); + return success(); + } if (succeeded( - matchAndSetConvolutionStrategy(entryPoint, linalgOp, gpuModel))) { + matchAndSetDirectConvolutionStrategy(entryPoint, linalgOp, gpuModel))) { LDBG("Activate convolution\n"); return success(); } diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h index 8b6e3960b0f4..bb2555092e56 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h @@ -19,6 +19,7 @@ namespace gpu { /// Forward declarations of all supported strategies. class BatchMatmulStrategy; class MatmulStrategy; +class ConvolutionStrategy; class DataTiledConvolutionStrategy; class PadStrategy; class SmallReductionStrategy; @@ -95,6 +96,8 @@ void buildBatchMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH, void buildConvolutionTensorCoreStrategy( ImplicitLocOpBuilder &b, Value variantH, const DataTiledConvolutionStrategy &strategy); +void buildConvolutionStrategy(ImplicitLocOpBuilder &b, Value variantH, + const ConvolutionStrategy &strategy); //===--------------------------------------------------------------------===// // Pad strategies. 
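For orientation, convolution dispatches now cascade through three entry points
inside matchAndSetTransformStrategy: the data-tiled tensor-core strategy, then
the implicit-GEMM strategy, then the direct SIMT strategy added in this patch,
each gated by its own iree-codegen-llvmgpu-enable-transform-dialect-* flag. A
minimal sketch of that cascade (the wrapper function trySetConvolutionStrategy
is hypothetical, not part of the patches; the three matchers are the ones
defined above):

static LogicalResult trySetConvolutionStrategy(func::FuncOp entryPoint,
                                               linalg::LinalgOp op,
                                               const GPUModel &gpuModel) {
  // Same order as in matchAndSetTransformStrategy: tensor-core data-tiled
  // convolutions first, then implicit GEMM, then the direct SIMT fallback.
  if (succeeded(
          matchAndSetDataTiledConvolutionStrategy(entryPoint, op, gpuModel)))
    return success();
  if (succeeded(
          matchAndSetConvolutionImplicitGemmStrategy(entryPoint, op, gpuModel)))
    return success();
  return matchAndSetDirectConvolutionStrategy(entryPoint, op, gpuModel);
}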
From 381bc5018ee6256429ae9fa6460a38e2c382b660 Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Mon, 10 Jul 2023 05:31:38 -0400
Subject: [PATCH 07/12] [TransformMatchers] Add matcher for any contraction

Allows matching non-named contraction ops, using the same
MatchedMatmulCaptures struct that exists for matmul and batch matmul.
---
 .../Transforms/TransformMatchers.h            |  7 +++
 .../IR/StructuredTransformOpsExt.cpp          | 57 +++++++++++++++++++
 .../lib/Transforms/TransformMatchers.cpp      | 27 +++++++++
 3 files changed, 91 insertions(+)

diff --git a/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h b/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h
index ebcc6f9ed70f..1e2c537eea4b 100644
--- a/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h
+++ b/llvm-external-projects/iree-dialects/include/iree-dialects/Transforms/TransformMatchers.h
@@ -1118,6 +1118,13 @@ void makeBatchMatmulMatcher(transform_ext::MatcherContext &matcherContext,
                            transform_ext::MatchedMatmulCaptures &captures,
                            bool mustMatchEntireFunc);
 
+void makeAnyContractionMatcher(MatcherContext &matcherContext,
+                               StructuredOpMatcher *&dtmCapture,
+                               StructuredOpMatcher *&fillCapture,
+                               StructuredOpMatcher *&trailingCapture,
+                               MatchedMatmulCaptures &captures,
+                               bool mustMatchEntireFunc);
+
 /// Create a group of matchers for a different code sequence of operations
 /// matching exactly a softmax operation.
 ///
diff --git a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
index 1391ef47c7cf..4386a9ea86fe 100644
--- a/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Dialect/LinalgTransform/IR/StructuredTransformOpsExt.cpp
@@ -913,6 +913,62 @@ batchMatmulCallback(transform_ext::MatchCallbackResult &res, Location loc,
   return emitSilenceableFailure(loc) << "failed to match batch matmul";
 }
 
+/// Match callback for a contraction op (either a named op or a linalg.generic
+/// with contraction semantics) fed by a linalg.fill.
+///
+/// Input handles:
+///
+/// - the container op, must be associated with one operation.
+///
+/// Output handles:
+///
+/// - the fill op initializing the output;
+/// - the main compute op.
+static DiagnosedSilenceableFailure
+anyContractionCallback(transform_ext::MatchCallbackResult &res, Location loc,
+                       const mlir::transform::TransformState &state,
+                       ValueRange handles) {
+  if (handles.size() != 1 ||
+      !llvm::hasSingleElement(state.getPayloadOps(handles[0]))) {
+    return emitSilenceableFailure(loc)
+           << "expected one handle to one operation";
+  }
+
+  transform_ext::StructuredOpMatcher *pattern, *fill, *trailing;
+  transform_ext::MatchedMatmulCaptures ignore;
+  transform_ext::MatcherContext matcherContext;
+  transform_ext::makeAnyContractionMatcher(matcherContext, pattern, fill,
+                                           trailing, ignore,
+                                           /*mustMatchEntireFunc=*/true);
+
+  // TODO: need a mechanism for this to go around the entire IR,
+  // potentially with list matches for each group.
+  Operation *root = *state.getPayloadOps(handles[0]).begin();
+
+  WalkResult walkResult = root->walk([&](Operation *op) {
+    pattern->resetCapture();
+    if (!matchPattern(op, *pattern))
+      return WalkResult::advance();
+
+    // TODO: notify properly
+    LLVM_DEBUG({
+      DBGS() << "fill:" << fill->getCaptured() << "\n";
+      DBGS() << "pattern: " << pattern->getCaptured() << "\n";
+      if (trailing->getCaptured())
+        DBGS() << "trailing:" << trailing->getCaptured() << "\n";
+    });
+
+    res.addPayloadGroup({fill->getCaptured()});
+    res.addPayloadGroup({pattern->getCaptured()});
+    res.addPotentiallyEmptyPayloadGroup(trailing->getCaptured());
+    return WalkResult::interrupt();
+  });
+
+  if (walkResult.wasInterrupted())
+    return DiagnosedSilenceableFailure::success();
+  return emitSilenceableFailure(loc) << "failed to match contraction";
+}
+
 /// Match callback for a tensor.pad. Matches *the first* occurrence of such pad
 /// within an op associated with the given handle.
 ///
@@ -980,6 +1036,7 @@ DiagnosedSilenceableFailure transform_ext::RegisterMatchCallbacksOp::apply(
   registry.registerCallback("convolution", convolutionCallback);
   registry.registerCallback("matmul", matmulCallback);
   registry.registerCallback("batch_matmul", batchMatmulCallback);
+  registry.registerCallback("contraction", anyContractionCallback);
   registry.registerCallback("pad", wrapAsEntireFuncMatch(padCallback));
   registry.registerCallback("reduction",
                             wrapAsEntireFuncMatch(reductionCallback));
diff --git a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp
index 322bd4d08aed..d7c0bc709aef 100644
--- a/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp
+++ b/llvm-external-projects/iree-dialects/lib/Transforms/TransformMatchers.cpp
@@ -1526,6 +1526,33 @@ void transform_ext::makeBatchMatmulMatcher(
   bmm = bmm.allTilableOpsCaptured();
 }
 
+void transform_ext::makeAnyContractionMatcher(
+    transform_ext::MatcherContext &matcherContext,
+    transform_ext::StructuredOpMatcher *&dtmCapture,
+    transform_ext::StructuredOpMatcher *&fillCapture,
+    transform_ext::StructuredOpMatcher *&trailingCapture,
+    transform_ext::MatchedMatmulCaptures &captures, bool mustMatchEntireFunc) {
+  auto &dtm =
+      transform_ext::m_StructuredOp(matcherContext)
+          .contractionDims(CaptureContractionDims(captures.contractionDims))
+          .dim(AllDims(), CaptureDims(captures.matmulOpSizes))
+          .input(NumEqualsTo(2))
+          .input(0, CaptureElementType(captures.lhsElementType))
+          .input(1, CaptureElementType(captures.rhsElementType))
+          .output(0, CaptureElementType(captures.outputElementType));
+  dtmCapture = &dtm;
+
+  auto &fill = transform_ext::m_StructuredOp(matcherContext);
+  dtm = dtm.output(0, fill);
+  fillCapture = &fill;
+
+  auto &trailing = m_StructuredOp(matcherContext);
+  dtm = dtm.result(0, HasAnyUse(), trailing, OptionalMatch());
+  if (mustMatchEntireFunc)
+    dtm = dtm.allTilableOpsCaptured();
+  trailingCapture = &trailing;
+}
+
 /// Match sum(%src, broadcast(%reduction))
 static void matchSubBroadcast(transform_ext::MatcherContext &matcherContext,

From f24c8440d5ded86bdd710686bebe03d43875430a Mon Sep 17 00:00:00 2001
From: Quinn Dawkins
Date: Mon, 10 Jul 2023 08:04:26 -0400
Subject: [PATCH 08/12] [TransformStrategies] Add data tiled matmul strategy
 and clean up other strategies

Maps data tiled matmuls to tensor core, assuming no distribution is
expected to happen over the inner tile.
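The strategy in this patch consumes the "contraction" match callback registered
in the previous commit; the callback yields three payload groups (the fill, the
contraction, and a possibly-empty trailing elementwise op). A minimal sketch of
the unpacking, mirroring the block-distribution builder below (b is assumed to
be an ImplicitLocOpBuilder and variantH the handle to the variant op):

  b.create<transform_ext::RegisterMatchCallbacksOp>();
  auto [fillH, matmulH, maybeTrailingH] = unpackRegisteredMatchCallback<3>(
      b, "contraction", transform::FailurePropagationMode::Propagate, variantH);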
--- .../TransformStrategies/Common/Common.cpp | 1 + .../GPU/AbstractGemmLikeStrategy.h | 1 + .../TransformStrategies/GPU/BUILD.bazel | 2 + .../TransformStrategies/GPU/CMakeLists.txt | 2 + .../TransformStrategies/GPU/Common.cpp | 17 +- .../GPU/ConvolutionTensorCoreStrategy.cpp | 33 +- .../GPU/ConvolutionTensorCoreStrategy.h | 12 +- .../GPU/DataTiledMatmulStrategy.cpp | 284 ++++++++++++++++++ .../GPU/DataTiledMatmulStrategy.h | 183 +++++++++++ .../TransformStrategies/GPU/Strategies.cpp | 64 ++++ .../TransformStrategies/GPU/Strategies.h | 9 + .../iree/compiler/Codegen/Utils/GPUUtils.cpp | 2 +- 12 files changed, 594 insertions(+), 16 deletions(-) create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.cpp create mode 100644 compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp index ac55b3d7d843..5ad82faf68e8 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp @@ -109,6 +109,7 @@ void mlir::iree_compiler::createTransformRegion( (void)sequence; LDBG("transformation script:\n"); LDBG("verification: " << sequence.verify().succeeded() << "\n"); + LLVM_DEBUG(sequence.dump()); } //===----------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h index 691c1f656068..5f459648834b 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h @@ -114,6 +114,7 @@ struct AbstractGemmLikeStrategy : GPUStrategy { virtual bool hasLhsCopy() const { return true; } virtual bool hasRhsCopy() const { return true; } + virtual bool hasResCopy() const { return true; } virtual MappingInfo lhsCopyMapping() const = 0; virtual LogicalResult validateLhsCopyMapping() const = 0; diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel index 8c9731678085..857a14a148a5 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/BUILD.bazel @@ -21,6 +21,7 @@ iree_compiler_cc_library( "ConvolutionStrategy.cpp", "ConvolutionTensorCoreStrategy.cpp", "CopyMapping.cpp", + "DataTiledMatmulStrategy.cpp", "MappingInfo.cpp", "MatmulTensorCoreStrategy.cpp", "PadStrategy.cpp", @@ -35,6 +36,7 @@ iree_compiler_cc_library( "ConvolutionStrategy.h", "ConvolutionTensorCoreStrategy.h", "CopyMapping.h", + "DataTiledMatmulStrategy.h", "MappingInfo.h", "MatmulTensorCoreStrategy.h", "PadStrategy.h", diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt index d00bd8e2bfd1..e33719f75763 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CMakeLists.txt @@ -20,6 +20,7 @@ iree_cc_library( "ConvolutionStrategy.h" "ConvolutionTensorCoreStrategy.h" "CopyMapping.h" + "DataTiledMatmulStrategy.h" 
"MappingInfo.h" "MatmulTensorCoreStrategy.h" "PadStrategy.h" @@ -33,6 +34,7 @@ iree_cc_library( "ConvolutionStrategy.cpp" "ConvolutionTensorCoreStrategy.cpp" "CopyMapping.cpp" + "DataTiledMatmulStrategy.cpp" "MappingInfo.cpp" "MatmulTensorCoreStrategy.cpp" "PadStrategy.cpp" diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp index 0e1a705f227a..6021a830e040 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Common.cpp @@ -430,17 +430,22 @@ mlir::iree_compiler::gpu::buildDistributeMatmulCopies( variantH, tensor::ParallelInsertSliceOp::getOperationName()); copyBackOpH = b.create( insertSliceH.getType(), insertSliceH); - } else { + } else if (strategy.hasResCopy()) { Value resH = b.create( paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(2)); copyBackOpH = b.create(resH.getType(), resH); } - Value lhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(0)); - Value rhsH = b.create( - paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(1)); + Value lhsH, rhsH; + if (strategy.hasLhsCopy()) { + lhsH = b.create( + paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(0)); + } + if (strategy.hasRhsCopy()) { + rhsH = b.create( + paddedMatmulOpH.getType(), paddedMatmulOpH, b.getI64IntegerAttr(1)); + } // Rewrite aligned pads as destination passing (linalg.copy) if (strategy.alignedLhs() && strategy.packingDimensions[0] && @@ -600,7 +605,7 @@ Value mlir::iree_compiler::gpu::buildConvertToTensorCoreOp( } /* else nothing to do for fma here */ // Post-hoc elimiation of barriers. - funcH = b.create(funcH); + // funcH = b.create(funcH); return funcH; } diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp index eab23f959e76..6213091ee658 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp @@ -46,6 +46,8 @@ using iree_compiler::gpu::buildHoistOutputPaddingOp; using iree_compiler::gpu::DataTiledConvolutionStrategy; using iree_compiler::gpu::MappingInfo; using iree_compiler::gpu::scaleUpByBitWidth; +using iree_compiler::IREE::transform_dialect:: + ApplyFoldReshapeIntoTensorHalInterfacePatternsOp; using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; using iree_compiler::IREE::transform_dialect:: IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp; @@ -63,8 +65,14 @@ void DataTiledConvolutionStrategy::initDefaultValues(const GPUModel &gpuModel) { // Pull in tile configs from flags. 
AbstractGemmLikeStrategy::initDefaultValues(gpuModel); if (!cliOptionsSpecified) { + numThreads = SmallVector{32, 1, 1}; + numWarps = SmallVector{1, 1, 1}; + blockTileSizes[0] = 64; blockTileSizes[1] = 1; - while (m() % blockTileSizes[0]) { + while ( + captures + .convolutionOpSizes[captures.convolutionDims.outputImage.back()] % + blockTileSizes[0]) { blockTileSizes[0] /= 2; } useWmma = true; @@ -104,7 +112,7 @@ buildDataTiledConvolutionStrategyBlockDistribution( buildTileFuseDistToForallWithTileSizes( /*builder=*/b, /*variantH=*/variantH, - /*rootH=*/convH, + /*rootH=*/fusionTargetH, /*opsToFuseH=*/fusionGroupH, /*tileSizes=*/ getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), @@ -178,6 +186,8 @@ static void buildCommonConvolutionLikeThreadSchedule( // Step 5. Tile the filter loop dimensions. SmallVector tileSizes( strategy.captures.convolutionDims.outputChannel.size(), 0); + tileSizes.append(strategy.captures.convolutionDims.inputChannel.size() - 1, + 0); tileSizes.append(strategy.captures.convolutionDims.outputImage.size(), 0); tileSizes.append(strategy.captures.convolutionDims.filterLoop.size(), 1); @@ -194,12 +204,25 @@ static void buildCommonConvolutionLikeThreadSchedule( b, variantH, filterTiledConvH, ValueRange(), getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), b.getArrayAttr(computeMapping.threadMapping)); + + // Step 6.5 Distribute to threads: SIMT programming model. + MappingInfo resCopyMapping = strategy.resCopyMapping(); + fillOpH = b.create(variantH, linalg::FillOp::getOperationName()); buildTileFuseDistToForallWithNumThreads( b, variantH, fillOpH, ValueRange(), - getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), - b.getArrayAttr(computeMapping.threadMapping)); + getAsOpFoldResult(b.getI64ArrayAttr(resCopyMapping.numThreads)), + b.getArrayAttr(resCopyMapping.threadMapping)); + buildTileFuseDistToForallWithNumThreads( + b, variantH, trailingH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(resCopyMapping.numThreads)), + b.getArrayAttr(resCopyMapping.threadMapping)); // Step 7. Apply vectorization + cleanups to what remains. + b.create(funcH, [](OpBuilder &b, Location loc) { + b.create(loc); + b.create(loc); + b.create(loc); + }); funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true); // Step 8. Bufferize and drop HAL descriptor from memref ops. @@ -214,7 +237,7 @@ static void buildCommonConvolutionLikeThreadSchedule( /*blockSize=*/strategy.numThreads, /*warpDims=*/strategy.numWarps, /*subgroupSize=*/strategy.targetSubgroupSize); - funcH = b.create(funcH); + // funcH = b.create(funcH); // Step 10. Convert to tensor core ops. // TODO: avoid consuming handles and returning here. diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h index 4952904b2583..06ce123839d0 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h @@ -108,6 +108,9 @@ class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { bool hasLhsCopy() const override { return true; } // Filter is not copied. 
bool hasRhsCopy() const override { return false; } + bool hasResCopy() const override { + return captures.convolutionDims.inputChannel.size() == 2; + } MappingInfo getBlockMapping() const override { SmallVector tileSizes; @@ -131,8 +134,8 @@ class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { int64_t inputTileH = captures.convolutionOpSizes[captures.convolutionDims.filterLoop[0]]; int64_t inputTileW = - captures.convolutionOpSizes[captures.convolutionDims.filterLoop[1]]; - +blockTileM() - 1; + captures.convolutionOpSizes[captures.convolutionDims.filterLoop[1]] + + blockTileM() - 1; int64_t icInnerTileSize = captures .convolutionOpSizes[captures.convolutionDims.inputChannel.back()]; @@ -165,10 +168,11 @@ class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { MappingInfo mapping = CopyMapping::getMappingInfo( ctx, totalNumThreads(), /*alignment=*/n(), - /*copySizes=*/ArrayRef{blockTileM(), blockTileN()}, + /*copySizes=*/ + ArrayRef{outputTileH, outputTileW, ocInnerTileSize}, /*favorPredication=*/false, /*elementalBitWidth=*/resElementalBitWidth()); - if (captures.convolutionDims.inputChannel.size() == 2) { + if (captures.convolutionDims.outputChannel.size() == 2) { mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); mapping.numThreads.insert(mapping.numThreads.begin(), 0); } diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.cpp new file mode 100644 index 000000000000..8dbbc55f6cb9 --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.cpp @@ -0,0 +1,284 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#include "iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h" + +#include "iree-dialects/Dialect/LinalgTransform/StructuredTransformOpsExt.h" +#include "iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.h" +#include "iree/compiler/Codegen/LLVMGPU/TransformExtensions/LLVMGPUExtensions.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/MappingInfo.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.h" +#include "mlir/Dialect/SCF/IR/SCF.h" +#include "mlir/Dialect/Transform/IR/TransformAttrs.h" +#include "mlir/Dialect/Transform/IR/TransformDialect.h" +#include "mlir/Dialect/Transform/IR/TransformOps.h" +#include "mlir/Dialect/Transform/IR/TransformTypes.h" +#include "mlir/Dialect/Vector/TransformOps/VectorTransformOps.h" +#include "mlir/IR/ImplicitLocOpBuilder.h" + +using namespace mlir; + +#define DEBUG_TYPE "iree-transform-builder" +#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ") +#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n") + +// TODO: significantly better namespacing. 
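// Illustrative sketch (not from the patch): initDefaultValues below pins
// blockTileN to the innermost N dimension (the data-tiling inner tile) and only
// shrinks the M block tile until it divides the problem's M extent, halving the
// warp/thread counts along x in lockstep while more than one warp remains. A
// standalone rendering of just the tile-size part, with hypothetical inputs:
[[maybe_unused]] static int64_t alignBlockTileM(int64_t m, int64_t blockTileM) {
  while (m % blockTileM)
    blockTileM /= 2;
  return blockTileM;
}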
+using iree_compiler::buildPad; +using iree_compiler::buildSelectFirstNonEmpty; +using iree_compiler::buildTileFuseDistToForallWithNumThreads; +using iree_compiler::buildTileFuseDistToForallWithTileSizes; +using iree_compiler::TileToForallAndFuseAndDistributeResult; +using iree_compiler::gpu::BatchMatmulStrategy; +using iree_compiler::gpu::buildBufferize; +using iree_compiler::gpu::buildConvertToAsyncCopies; +using iree_compiler::gpu::buildConvertToTensorCoreOp; +using iree_compiler::gpu::buildDistributeMatmulCopies; +using iree_compiler::gpu::buildHoistOutputPaddingOp; +using iree_compiler::gpu::buildMatmulVectorization; +using iree_compiler::gpu::buildMultiBuffering; +using iree_compiler::gpu::buildPipelineSharedMemoryCopies; +using iree_compiler::gpu::DataTiledMatmulStrategy; +using iree_compiler::gpu::MappingInfo; +using iree_compiler::gpu::scaleUpByBitWidth; +using iree_compiler::IREE::transform_dialect:: + ApplyFoldReshapeIntoTensorHalInterfacePatternsOp; +using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp; +using iree_compiler::IREE::transform_dialect:: + IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp; +using transform::FuseIntoContainingOp; +using transform::MatchOp; +using transform_ext::RegisterMatchCallbacksOp; + +void DataTiledMatmulStrategy::initDefaultValues(const GPUModel &gpuModel) { + // Set the configuration for padding the matmul. + paddingValueTypes = {captures.lhsElementType, captures.rhsElementType, + captures.outputElementType}; + paddingDimensions = {0, 1, 2}; + packingDimensions = {1, 1, 1}; + + // Pull in tile configs from flags. + AbstractGemmLikeStrategy::initDefaultValues(gpuModel); + + // Data tiled strategies have specific requirements so adjust here. + + // Consolidate the warps/threads along X. + numWarps[0] *= numWarps[1]; + numWarps[1] = 1; + numThreads[0] *= numThreads[1]; + numThreads[1] *= 1; + // BlockTileN is effectively the inner tile. + blockTileSizes[1] = captures.matmulOpSizes[captures.contractionDims.n.back()]; + // Adjust downwards to force alignment along M. + while (m() % blockTileSizes[0]) { + blockTileSizes[0] /= 2; + if (numWarps[0] > 1) { + numWarps[0] /= 2; + numThreads[0] /= 2; + } + } + // Reduction tile size is unused. + reductionTileSize = 1; + // Force wmma. + useWmma = true; + useMmaSync = false; + useFma = false; + // Disable pipelining. + useAsyncCopies = false; + pipelineDepth = 0; + if (gpuModel.minSubgroupSize) + targetSubgroupSize = *gpuModel.minSubgroupSize; +} + +LLVM_DUMP_METHOD void DataTiledMatmulStrategy::dump() const { + print(llvm::errs()); +} + +void DataTiledMatmulStrategy::print(llvm::raw_ostream &os) const { + os << "\n--- Data Tiled Matmul strategy ---\n"; + AbstractGemmLikeStrategy::print(os); +} + +// TODO: Implement a validator. +LogicalResult +DataTiledMatmulStrategy::validate(const GPUModel &gpuModel) const { + return success(); +} + +static std::tuple +buildDataTiledMatmulStrategyBlockDistribution( + ImplicitLocOpBuilder &b, Value variantH, + const DataTiledMatmulStrategy &strategy) { + // Step 1. Call the matcher. Note that this is the same matcher as used to + // trigger this compilation path, so it must always apply. + b.create(); + auto [fillH, matmulH, maybeTrailingH] = unpackRegisteredMatchCallback<3>( + b, "contraction", transform::FailurePropagationMode::Propagate, variantH); + + // Step 2. Create the block/mapping tiling level and fusee. 
+ auto [fusionTargetH, fusionGroupH] = + buildSelectFirstNonEmpty(b, maybeTrailingH, matmulH); + MappingInfo blockMapping = strategy.getBlockMapping(); + TileToForallAndFuseAndDistributeResult tileResult = + buildTileFuseDistToForallWithTileSizes( + /*builder=*/b, + /*variantH=*/variantH, + /*rootH=*/fusionTargetH, + /*opsToFuseH=*/fusionGroupH, + /*tileSizes=*/ + getAsOpFoldResult(b.getI64ArrayAttr(blockMapping.tileSizes)), + /*threadDimMapping=*/ + b.getArrayAttr(blockMapping.threadMapping)); + + auto [blockMatmulH, maybeBlockTrailingH] = buildSelectFirstNonEmpty( + b, tileResult.resultingFusedOpsHandles.front(), tileResult.tiledOpH); + + Value fusedFillH = + b.create(fillH, tileResult.forallH).getFusedOp(); + + // Handle the workgroup count region. + b.create( + tileResult.forallH); + + // TODO: handle trailing op. + return std::make_tuple(fusedFillH, blockMatmulH, maybeBlockTrailingH, + tileResult.forallH); +} + +/// Builds the common part of the schedule for matmuls and batched matmuls. +static void +buildCommonMatmulLikeThreadSchedule(ImplicitLocOpBuilder &b, Value variantH, + Value fillH, Value matmulH, Value trailingH, + const DataTiledMatmulStrategy &strategy) { + using mlir::iree_compiler::buildLowerVectorMasksAndCleanup; + using mlir::iree_compiler::buildTileFuseToScfFor; + using namespace mlir::iree_compiler::gpu; + + // Tile the reduction loop (last in the list). + SmallVector tileSizes(strategy.captures.matmulOpSizes.size() - + strategy.captures.contractionDims.k.size(), + 0); + if (strategy.captures.contractionDims.k.size() == 2) { + tileSizes.push_back(1); + } else { + tileSizes.push_back( + strategy.captures + .matmulOpSizes[strategy.captures.contractionDims.k.back()]); + } + + // Avoid canonicalizing before the pad to avoid folding away the extract_slice + // on the output needed to hoist the output pad. + auto tileReductionResult = buildTileFuseToScfFor( + b, variantH, matmulH, {}, getAsOpFoldResult(b.getI64ArrayAttr(tileSizes)), + /*canonicalize=*/false); + + // Step 2. Pad the (batch) matmul op. + auto paddedMatmulOpH = + buildPad(b, tileReductionResult.tiledOpH, + strategy.getZeroPadAttrFromElementalTypes(b).getValue(), + strategy.paddingDimensions, strategy.packingDimensions); + + // Step 3. Hoist the padding of the output operand above the reduction loop. + // The resulting fillOp will be mapped with the contraction using an SIMD + // programming model. + Value fillOpH = fillH; + if (!strategy.alignedRes()) { + fillOpH = buildHoistOutputPaddingOp(b, variantH, paddedMatmulOpH); + } + + // Running canonicalization is required here to enable aligned pads to become + // linalg.copy ops when rewriting in DPS. + Value funcH = + b.create(variantH, func::FuncOp::getOperationName()); + iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); + + // Step 4. Distribute pad and copies: SIMT programming model. + auto [lhsCopyOpH, rhsCopyOpH, copyBackOpH] = + buildDistributeMatmulCopies(b, variantH, paddedMatmulOpH, strategy); + + // Step 5. Distribute to warps: SIMD programming model. + // TODO: get the number of warps from strategy. 
+ MappingInfo computeMapping = strategy.computeMapping(); + buildTileFuseDistToForallWithNumThreads( + b, variantH, paddedMatmulOpH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + buildTileFuseDistToForallWithNumThreads( + b, variantH, fillOpH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(computeMapping.numThreads)), + b.getArrayAttr(computeMapping.threadMapping)); + + // Step 5.5. Distribute to threads: SIMT programming model. + MappingInfo resCopyMapping = strategy.resCopyMapping(); + buildTileFuseDistToForallWithNumThreads( + b, variantH, trailingH, ValueRange(), + getAsOpFoldResult(b.getI64ArrayAttr(resCopyMapping.numThreads)), + b.getArrayAttr(resCopyMapping.threadMapping)); + + // Step 6. Rank-reduce and vectorize. + b.create(funcH, [](OpBuilder &b, Location loc) { + b.create(loc); + b.create(loc); + b.create(loc); + }); + buildMatmulVectorization(b, variantH, lhsCopyOpH, rhsCopyOpH, copyBackOpH, + strategy); + + // Step 7. Bufferize and drop HAL descriptor from memref ops. + variantH = buildBufferize(b, variantH); + + // Step 8. Post-bufferization mapping to blocks and threads. + // Need to match again since bufferize invalidated all handles. + // TODO: assumes a single func::FuncOp to transform, needs hardening. + funcH = b.create(variantH, func::FuncOp::getOperationName()); + funcH = + buildMapToBlockAndThreads(b, funcH, + /*blockSize=*/strategy.numThreads, + /*warpDims=*/strategy.numWarps, + /*subgroupSize=*/strategy.targetSubgroupSize); + funcH = b.create(funcH); + + // Step 9. Convert to tensor core ops. + // TODO: avoid consuming handles and returning here. + funcH = buildConvertToTensorCoreOp(b, funcH, strategy); + + // TODO: Support pipelining strategies without async copy (e.g. store to + // shared memory in stage 0). + if (strategy.useAsyncCopies) { + // Step 10. Multi-buffering. + if (strategy.pipelineDepth > 1) + buildMultiBuffering(b, funcH, strategy); + + // Step 11. Convert to async copies. + // TODO: avoid consuming handles and returning here. + funcH = buildConvertToAsyncCopies(b, funcH, strategy); + + // Step 12. Pipeline shared memory copies. + if (strategy.pipelineDepth > 1) + buildPipelineSharedMemoryCopies(b, funcH, strategy); + } + + // Step 13. Late lowerings and cleanups. + buildLowerVectorMasksAndCleanup(b, funcH); +} + +void iree_compiler::gpu::buildDataTiledMatmulStrategy( + ImplicitLocOpBuilder &b, Value variantH, + const DataTiledMatmulStrategy &strategy) { + LLVM_DEBUG(strategy.print(DBGS())); + + // Step 1. Apply block-level part of the strategy, keeps everything fused. + auto [fillH, matmulH, maybeTiledTrailingHBlock, forall] = + buildDataTiledMatmulStrategyBlockDistribution(b, variantH, strategy); + buildCommonMatmulLikeThreadSchedule(b, variantH, fillH, matmulH, + maybeTiledTrailingHBlock, strategy); +} diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h new file mode 100644 index 000000000000..c5c1507bb6ed --- /dev/null +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h @@ -0,0 +1,183 @@ +// Copyright 2023 The IREE Authors +// +// Licensed under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +#ifndef IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_DATA_TILED_MATMUL_STRATEGY_H_ +#define IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_DATA_TILED_MATMUL_STRATEGY_H_ + +#include "iree-dialects/Transforms/TransformMatchers.h" +#include "iree/compiler/Codegen/TransformStrategies/Common/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/AbstractGemmLikeStrategy.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/Common.h" +#include "iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h" +#include "llvm/Support/raw_ostream.h" +#include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Support/LogicalResult.h" +#include "mlir/Support/MathExtras.h" + +namespace llvm { +class raw_ostream; +} + +namespace mlir { +namespace iree_compiler { +namespace gpu { + +struct GPUModel; + +class DataTiledMatmulStrategy : public AbstractGemmLikeStrategy { +public: + DataTiledMatmulStrategy(MLIRContext *context, + const transform_ext::MatchedMatmulCaptures &captures, + const GPUModel &gpuModel) + : AbstractGemmLikeStrategy(gpuModel), ctx(context), captures(captures) { + initDefaultValues(gpuModel); + } + + DataTiledMatmulStrategy(const DataTiledMatmulStrategy &) = default; + DataTiledMatmulStrategy &operator=(const DataTiledMatmulStrategy &) = default; + + /// Constructor quantities. + MLIRContext *ctx; + transform_ext::MatchedMatmulCaptures captures; + + /// Initialize values from the CLI. Set cliOptionsSpecified to true if the + /// default CLI values have been overriden. + void initDefaultValues(const GPUModel &gpuModel) override; + + LogicalResult validate(const GPUModel &gpuModel) const override; + + int64_t m() const override { + int64_t mElements = 1; + for (auto i : captures.contractionDims.m) { + mElements *= captures.matmulOpSizes[i]; + } + return mElements; + } + int64_t n() const override { + int64_t nElements = 1; + for (auto i : captures.contractionDims.n) { + nElements *= captures.matmulOpSizes[i]; + } + return nElements; + } + int64_t k() const override { + int64_t kElements = 1; + for (auto i : captures.contractionDims.k) { + kElements *= captures.matmulOpSizes[i]; + } + return kElements; + } + + int64_t blockTileM() const override { return blockTileSizes[0]; } + int64_t blockTileN() const override { + return captures.matmulOpSizes[captures.contractionDims.n.back()]; + } + + int64_t numWarpsX() const override { return numWarps[0]; } + int64_t numWarpsY() const override { return 1; } + + Type getLhsElementalType() const override { return captures.lhsElementType; } + Type getRhsElementalType() const override { return captures.rhsElementType; } + Type getResElementalType() const override { + return captures.outputElementType; + } + + MappingInfo getBlockMapping() const override { + SmallVector tileSizes; + SmallVector threadMapping = {blockX(ctx)}; + // Outer output channel. + if (captures.contractionDims.n.size() == 2) { + tileSizes.push_back(1); + threadMapping = {blockY(ctx), blockX(ctx)}; + } + tileSizes.push_back(blockTileM()); + return MappingInfo{/*numThreads=*/{}, + /*tileSizes=*/tileSizes, + /*threadMapping=*/threadMapping, + /*vectorSize=*/std::nullopt}; + } + + // LHS copy is of size mxk. + MappingInfo lhsCopyMapping() const override { + int64_t kInnerTileSize = + captures.matmulOpSizes[captures.contractionDims.k.back()]; + return CopyMapping::getMappingInfo( + ctx, totalNumThreads(), + /*alignment=*/k(), + /*copySizes=*/captures.contractionDims.k.size() == 2 + ? 
ArrayRef{1, blockTileM(), kInnerTileSize} + : ArrayRef{blockTileM(), kInnerTileSize}, + /*favorPredication=*/false, + /*elementalBitWidth=*/lhsElementalBitWidth()); + } + // TODO: Implement validator. + LogicalResult validateLhsCopyMapping() const override { return success(); } + + // RHS copy is of size kxn. + MappingInfo rhsCopyMapping() const override { + int64_t kInnerTileSize = + captures.matmulOpSizes[captures.contractionDims.k.back()]; + int64_t nInnerTileSize = + captures.matmulOpSizes[captures.contractionDims.n.back()]; + MappingInfo mapping = CopyMapping::getMappingInfo( + ctx, totalNumThreads(), + /*alignment=*/k(), + /*copySizes=*/ArrayRef{nInnerTileSize, kInnerTileSize}, + /*favorPredication=*/false, + /*elementalBitWidth=*/rhsElementalBitWidth()); + if (captures.contractionDims.n.size() == 2) { + mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); + mapping.numThreads.insert(mapping.numThreads.begin(), 0); + } + if (captures.contractionDims.k.size() == 2) { + mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); + mapping.numThreads.insert(mapping.numThreads.begin(), 0); + } + return mapping; + } + // TODO: Implement validator. + LogicalResult validateRhsCopyMapping() const override { return success(); } + + // RES copy is of size mxn. + MappingInfo resCopyMapping() const override { + int64_t nInnerTileSize = + captures.matmulOpSizes[captures.contractionDims.n.back()]; + return CopyMapping::getMappingInfo( + ctx, totalNumThreads(), + /*alignment=*/n(), + /*copySizes=*/captures.contractionDims.n.size() == 2 + ? ArrayRef{1, blockTileM(), nInnerTileSize} + : ArrayRef{blockTileM(), nInnerTileSize}, + /*favorPredication=*/false, + /*elementalBitWidth=*/resElementalBitWidth()); + } + // TODO: Implement validator. + LogicalResult validateResCopyMapping() const override { return success(); } + + // COMPUTE is of size mxn. + MappingInfo computeMapping() const override { + if (useFma) { + // When using FMA we don't need to map to warps, instead just match what + // the copy does. + return resCopyMapping(); + } + return MappingInfo{/*numThreads=*/captures.contractionDims.n.size() == 2 + ? 
SmallVector{0, numWarpsX()}
+                           : SmallVector{numWarpsX()},
+                       /*tileSizes=*/{},
+                       /*threadMapping=*/{warpX(ctx)},
+                       /*vectorSize=*/std::nullopt};
+  }
+
+  void print(llvm::raw_ostream &os) const override;
+  LLVM_DUMP_METHOD void dump() const override;
+};
+
+} // namespace gpu
+} // namespace iree_compiler
+} // namespace mlir
+
+#endif // IREE_COMPILER_CODEGEN_TRANSFORM_DIALECT_STRATEGIES_GPU_DATA_TILED_MATMUL_STRATEGY_H_
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp
index 5004b4864e13..d448ed511972 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp
@@ -17,6 +17,7 @@
 #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionImplicitGemmStrategy.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h"
+#include "iree/compiler/Codegen/TransformStrategies/GPU/DataTiledMatmulStrategy.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/MatmulTensorCoreStrategy.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/PadStrategy.h"
 #include "iree/compiler/Codegen/TransformStrategies/GPU/SmallReductionStrategy.h"
@@ -83,12 +84,18 @@ llvm::cl::opt clGPUEnableTransformDialectBatchMatmulStrategy(
     llvm::cl::desc("activate the batch matmul strategy, additional "
                    "configuration flags are shared with matmul"),
     llvm::cl::init(false));
+llvm::cl::opt clGPUEnableTransformDialectDataTiledMatmulStrategy(
+    "iree-codegen-llvmgpu-enable-transform-dialect-data-tiled-matmul-strategy",
+    llvm::cl::desc("activate the data tiled matmul strategy, additional "
+                   "configuration flags are shared with matmul"),
+    llvm::cl::init(true));
 // TODO: significantly better namespacing.
 using iree_compiler::gpu::AbstractGemmLikeStrategy;
 using iree_compiler::gpu::BatchMatmulStrategy;
 using iree_compiler::gpu::ConvolutionStrategy;
 using iree_compiler::gpu::DataTiledConvolutionStrategy;
+using iree_compiler::gpu::DataTiledMatmulStrategy;
 using iree_compiler::gpu::GPUModel;
 using iree_compiler::gpu::ImplicitGemmStrategy;
 using iree_compiler::gpu::kCudaMaxVectorLoadBitWidth;
@@ -551,6 +558,58 @@ static LogicalResult matchAndSetMatmulStrategy(func::FuncOp entryPoint,
   return success();
 }
+static DataTiledMatmulStrategy
+getDataTiledMatmulConfig(MLIRContext *context, MatchedMatmulCaptures &captures,
+                         const GPUModel &gpuModel) {
+  return DataTiledMatmulStrategy(context, captures, gpuModel);
+}
+
+/// Match the supported data tiled matmuls and set the transform dialect
+/// strategy for them.
+static LogicalResult
+matchAndSetDataTiledMatmulStrategy(func::FuncOp entryPoint, linalg::LinalgOp op,
+                                   const GPUModel &gpuModel) {
+  if (!clGPUEnableTransformDialectDataTiledMatmulStrategy) {
+    LDBG("--Data tiled matmul strategy flag turned off\n");
+    return failure();
+  }
+
+  StructuredOpMatcher *fill;
+  StructuredOpMatcher *dtm;
+  StructuredOpMatcher *trailing;
+  transform_ext::MatchedMatmulCaptures captures;
+  transform_ext::MatcherContext matcherContext;
+  transform_ext::makeAnyContractionMatcher(matcherContext, dtm, fill, trailing,
+                                           captures,
+                                           /*mustMatchEntireFunc=*/true);
+  if (!matchPattern(op, *dtm)) {
+    LDBG("--Data tiled matmul strategy failed to match\n");
+    return failure();
+  }
+
+  if (captures.contractionDims.batch.size() != 0 ||
+      captures.contractionDims.m.size() != 1 ||
+      (captures.contractionDims.n.size() != 2 &&
+       captures.contractionDims.k.size() != 2)) {
+    LDBG("--Data tiled matmul failed problem type check\n");
+    return failure();
+  }
+
+  DataTiledMatmulStrategy strategy =
+      getDataTiledMatmulConfig(entryPoint->getContext(), captures, gpuModel);
+  if (failed(strategy.validate(gpuModel))) {
+    LDBG("--Data tiled matmul strategy failed to validate\n");
+    return failure();
+  }
+
+  iree_compiler::createTransformRegion(
+      entryPoint, [&](ImplicitLocOpBuilder &b, Value variantH) {
+        return iree_compiler::gpu::buildDataTiledMatmulStrategy(b, variantH,
+                                                                strategy);
+      });
+  return success();
+}
+
 //===--------------------------------------------------------------------===//
 // Convolution strategies.
 //===--------------------------------------------------------------------===//
@@ -991,6 +1050,11 @@ LogicalResult mlir::iree_compiler::gpu::matchAndSetTransformStrategy(
     LDBG("Activate batch matmul\n");
     return success();
   }
+  if (succeeded(
+          matchAndSetDataTiledMatmulStrategy(entryPoint, linalgOp, gpuModel))) {
+    LDBG("Activate data tiled matmul\n");
+    return success();
+  }
   if (succeeded(matchAndSetDataTiledConvolutionStrategy(entryPoint, linalgOp,
                                                         gpuModel))) {
     LDBG("Activate convolution\n");
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h
index bb2555092e56..f606b2336cd0 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.h
@@ -21,6 +21,7 @@ class BatchMatmulStrategy;
 class MatmulStrategy;
 class ConvolutionStrategy;
 class DataTiledConvolutionStrategy;
+class DataTiledMatmulStrategy;
 class PadStrategy;
 class SmallReductionStrategy;
 class StagedReductionStrategy;
@@ -90,6 +91,14 @@ void buildMatmulTensorCoreStrategy(ImplicitLocOpBuilder &b, Value variantH,
 void buildBatchMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH,
                               const BatchMatmulStrategy &strategy);
+//===--------------------------------------------------------------------===//
+// Data tiled matmul strategies.
+//===--------------------------------------------------------------------===//
+/// Entry point to build the transform IR corresponding to a WMMA tensor
+/// core-based strategy for linalg.fill + data tiled matmul.
+void buildDataTiledMatmulStrategy(ImplicitLocOpBuilder &b, Value variantH,
+                                  const DataTiledMatmulStrategy &strategy);
+
 //===--------------------------------------------------------------------===//
 // Convolution strategies.
//===--------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp index eca1e5ad4043..133f9a24b79c 100644 --- a/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp +++ b/compiler/src/iree/compiler/Codegen/Utils/GPUUtils.cpp @@ -578,7 +578,7 @@ std::optional> getWmmaNativeVectorSize(Operation *op) { int64_t m = 16; int64_t n = 16; if (auto contract = dyn_cast(op)) { - int64_t k = contract.getLhsType().getElementType().isF16() ? 16 : 8; + int64_t k = contract.getLhsType().getElementType().isF32() ? 8 : 16; SmallVector nativeSize(contract.getIteratorTypes().size() - 3, 1); nativeSize.append({m, n, k}); return nativeSize; From f5f46ef93a0cae2323fea698d5b6d44afaddfc7a Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 13 Jul 2023 16:32:03 -0400 Subject: [PATCH 09/12] [TransformStrategies] Enable fused tensor pad in conv strategy Additionally improve distribution of pad copies for convolution strategy by greedily distributing over the outer most dimensions of the copy. --- .../TransformStrategies/Common/Common.cpp | 8 ++- .../TransformStrategies/Common/Common.h | 3 +- .../GPU/ConvolutionStrategy.cpp | 6 ++- .../GPU/ConvolutionTensorCoreStrategy.cpp | 31 ++++++++++- .../GPU/ConvolutionTensorCoreStrategy.h | 8 +-- .../TransformStrategies/GPU/CopyMapping.cpp | 51 ++++++++++++------- .../TransformStrategies/GPU/CopyMapping.h | 6 ++- .../TransformStrategies/GPU/Strategies.cpp | 4 +- 8 files changed, 88 insertions(+), 29 deletions(-) diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp index 5ad82faf68e8..aea31cfede85 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.cpp @@ -291,7 +291,13 @@ Value mlir::iree_compiler::buildPad( Value mlir::iree_compiler::buildVectorize(ImplicitLocOpBuilder &b, Value funcH, bool applyCleanups, bool vectorizePadding, - bool vectorizeNdExtract) { + bool vectorizeNdExtract, + bool useIreePadHandling) { + if (useIreePadHandling) { + funcH = b.create( + funcH.getType(), funcH, + b.getStringAttr("iree-codegen-vectorize-tensor-pad")); + } funcH = b.create(funcH, vectorizePadding, vectorizeNdExtract); if (applyCleanups) { iree_compiler::buildCanonicalizationAndEnablingTransforms(b, funcH); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h index e0ff79c2017e..5f91b6010cdc 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/Common/Common.h @@ -188,7 +188,8 @@ Value buildPad(ImplicitLocOpBuilder &b, Value opH, /// If `applyCleanups` is true, also apply cleanup patterns. Value buildVectorize(ImplicitLocOpBuilder &b, Value funcH, bool applyCleanups = false, bool vectorizePadding = false, - bool vectorizeNdExtract = false); + bool vectorizeNdExtract = false, + bool useIreePadHandling = false); /// Build transform IR that applies lowering of masked vector transfer /// operations and subsequent cleanup patterns (fold-memref-aliases). 
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp
index 3dc899d555a4..300110239a2b 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionStrategy.cpp
@@ -255,7 +255,11 @@ static void buildCommonConvolutionLikeThreadSchedule(
     b.create(loc);
     b.create(loc);
   });
-  funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true);
+  funcH = iree_compiler::buildVectorize(b, funcH,
+                                        /*applyCleanups=*/true,
+                                        /*vectorizePadding=*/false,
+                                        /*vectorizeNdExtract=*/false,
+                                        /*useIreePadHandling=*/true);
 
   // Step 8. Bufferize and drop HAL descriptor from memref ops.
   variantH = buildBufferize(b, variantH);
diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp
index 6213091ee658..c08d703a01ee 100644
--- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp
+++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.cpp
@@ -46,8 +46,12 @@ using iree_compiler::gpu::buildHoistOutputPaddingOp;
 using iree_compiler::gpu::DataTiledConvolutionStrategy;
 using iree_compiler::gpu::MappingInfo;
 using iree_compiler::gpu::scaleUpByBitWidth;
+using iree_compiler::IREE::transform_dialect::
+    ApplyCommonSubexpressionEliminationOp;
 using iree_compiler::IREE::transform_dialect::
     ApplyFoldReshapeIntoTensorHalInterfacePatternsOp;
+using iree_compiler::IREE::transform_dialect::
+    ApplySwapTensorPadWithExtractSliceOp;
 using iree_compiler::IREE::transform_dialect::EliminateGpuBarriersOp;
 using iree_compiler::IREE::transform_dialect::
     IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp;
@@ -218,12 +222,37 @@ static void buildCommonConvolutionLikeThreadSchedule(
       b.getArrayAttr(resCopyMapping.threadMapping));
 
   // Step 7. Apply vectorization + cleanups to what remains.
+  b.create(funcH, [](OpBuilder &b, Location loc) {
+    b.create(loc);
+  });
+  funcH = b.create(
+      funcH.getType(), funcH,
+      b.getStringAttr("iree-codegen-concretize-pad-result-shape"));
+  b.create(funcH);
+  b.create(funcH, [](OpBuilder &b, Location loc) {
+    b.create(loc);
+  });
+  funcH = b.create(
+      funcH.getType(), funcH,
+      b.getStringAttr("iree-codegen-concretize-pad-result-shape"));
+  b.create(funcH);
+  b.create(funcH, [](OpBuilder &b, Location loc) {
+    b.create(loc);
+  });
+  funcH = b.create(
+      funcH.getType(), funcH,
+      b.getStringAttr("iree-codegen-concretize-pad-result-shape"));
+  b.create(funcH);
   b.create(funcH, [](OpBuilder &b, Location loc) {
     b.create(loc);
     b.create(loc);
     b.create(loc);
   });
-  funcH = iree_compiler::buildVectorize(b, funcH, /*applyCleanups=*/true);
+  funcH = iree_compiler::buildVectorize(b, funcH,
+                                        /*applyCleanups=*/true,
+                                        /*vectorizePadding=*/false,
+                                        /*vectorizeNdExtract=*/false,
+                                        /*useIreePadHandling=*/true);
 
   // Step 8. Bufferize and drop HAL descriptor from memref ops.
variantH = buildBufferize(b, variantH); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h index 06ce123839d0..6f9b7af73a4d 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/ConvolutionTensorCoreStrategy.h @@ -144,8 +144,9 @@ class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { /*alignment=*/k(), /*copySizes=*/ ArrayRef{inputTileH, inputTileW, icInnerTileSize}, - /*favorPredication=*/false, - /*elementalBitWidth=*/lhsElementalBitWidth()); + /*favorPredication=*/true, + /*elementalBitWidth=*/lhsElementalBitWidth(), + /*favorLazyOuterDistributing=*/true); if (captures.convolutionDims.inputChannel.size() == 2) { mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); mapping.numThreads.insert(mapping.numThreads.begin(), 0); @@ -171,7 +172,8 @@ class DataTiledConvolutionStrategy : public AbstractGemmLikeStrategy { /*copySizes=*/ ArrayRef{outputTileH, outputTileW, ocInnerTileSize}, /*favorPredication=*/false, - /*elementalBitWidth=*/resElementalBitWidth()); + /*elementalBitWidth=*/resElementalBitWidth(), + /*favorLazyOuterDistributing=*/false); if (captures.convolutionDims.outputChannel.size() == 2) { mapping.tileSizes.insert(mapping.tileSizes.begin(), 1); mapping.numThreads.insert(mapping.numThreads.begin(), 0); diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp index c3a957be348a..f71e2e50cc23 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.cpp @@ -29,11 +29,10 @@ int64_t iree_compiler::gpu::CopyMapping::maxContiguousElementsToTransfer( } FailureOr -iree_compiler::gpu::CopyMapping::numThreadsForCopy(int totalNumThreads, - int64_t alignment, - ArrayRef sizes, - bool favorPredication, - int64_t elementalBitWidth) { +iree_compiler::gpu::CopyMapping::numThreadsForCopy( + int totalNumThreads, int64_t alignment, ArrayRef sizes, + bool favorPredication, int64_t elementalBitWidth, + bool favorLazyOuterDistributing) { LDBG("\nSTART numThreadsForCopy, favorPredication: " << favorPredication); LLVM_DEBUG(llvm::interleaveComma(sizes, DBGS() << "--sizes: "); llvm::dbgs() << "\n";); @@ -81,19 +80,35 @@ iree_compiler::gpu::CopyMapping::numThreadsForCopy(int totalNumThreads, SmallVector scaledSizes{sizes.begin(), sizes.end()}; scaledSizes.back() /= actualVectorSize; - int64_t numThreadsRemaining = totalNumThreads; - LDBG("--numThreadsRemaining: " << numThreadsRemaining); SmallVector factors; - for (auto s : llvm::reverse(scaledSizes)) { - int64_t gcd = std::gcd(numThreadsRemaining, s); - factors.push_back(gcd); - numThreadsRemaining /= gcd; - LDBG("--new factors: " << gcd); + if (favorLazyOuterDistributing) { + int64_t numThreadsUsed = 1; + for (auto s : scaledSizes) { + int newThreads = 1; + for (auto maybeFactor : llvm::seq(1l, s + 1)) { + if (maybeFactor * numThreadsUsed > totalNumThreads) + break; + if (s % maybeFactor == 0) + newThreads = maybeFactor; + } + factors.push_back(newThreads); + numThreadsUsed *= newThreads; + LDBG("--new factors: " << newThreads); + LDBG("--numThreadsUsed: " << numThreadsUsed); + } + } else { + int64_t numThreadsRemaining = totalNumThreads; LDBG("--numThreadsRemaining: " 
<< numThreadsRemaining); + for (auto s : llvm::reverse(scaledSizes)) { + int64_t gcd = std::gcd(numThreadsRemaining, s); + factors.push_back(gcd); + numThreadsRemaining /= gcd; + LDBG("--new factors: " << gcd); + LDBG("--numThreadsRemaining: " << numThreadsRemaining); + } + std::reverse(factors.begin(), factors.end()); } - std::reverse(factors.begin(), factors.end()); - LLVM_DEBUG(llvm::interleaveComma(factors, DBGS() << "numThreads: "); llvm::dbgs() << "\n"; LDBG("actualVectorSize: " << actualVectorSize);); @@ -104,12 +119,12 @@ iree_compiler::gpu::CopyMapping::numThreadsForCopy(int totalNumThreads, iree_compiler::gpu::MappingInfo iree_compiler::gpu::CopyMapping::getMappingInfo( MLIRContext *ctx, int totalNumThreads, int64_t alignment, ArrayRef copySizes, bool favorPredication, - int64_t elementalBitWidth) { + int64_t elementalBitWidth, bool favorLazyOuterDistributing) { assert(!copySizes.empty() && copySizes.size() <= 3 && "only 1,2,3-D copies are supported for now"); - FailureOr maybeCopyMapping = - CopyMapping::numThreadsForCopy(totalNumThreads, alignment, copySizes, - favorPredication, elementalBitWidth); + FailureOr maybeCopyMapping = CopyMapping::numThreadsForCopy( + totalNumThreads, alignment, copySizes, favorPredication, + elementalBitWidth, favorLazyOuterDistributing); // If failed, try again with predication; this must succeed. if (failed(maybeCopyMapping)) { assert(!favorPredication && diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h index 3918becc4d96..06e2f244a4b0 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/CopyMapping.h @@ -57,7 +57,8 @@ struct CopyMapping { static FailureOr numThreadsForCopy(int totalNumThreads, int64_t alignment, ArrayRef sizes, bool favorPredication, - int64_t elementalBitWidth = 32); + int64_t elementalBitWidth = 32, + bool favorLazyOuterDistributing = false); /// Greedily compute the MappingInfo to use to perform a copy of `sizes` /// elements of bitwidth `elementalBitWidth`. 
@@ -75,7 +76,8 @@ struct CopyMapping { static MappingInfo getMappingInfo(MLIRContext *ctx, int totalNumThreads, int64_t alignment, ArrayRef sizes, bool favorPredication = false, - int64_t elementalBitWidth = 32); + int64_t elementalBitWidth = 32, + bool favorLazyOuterDistributing = false); }; } // namespace gpu diff --git a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp index d448ed511972..c346de3619c0 100644 --- a/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp +++ b/compiler/src/iree/compiler/Codegen/TransformStrategies/GPU/Strategies.cpp @@ -820,8 +820,8 @@ static LogicalResult matchAndSetDataTiledConvolutionStrategy( return failure(); } - if (!fill->getCaptured() || pad->getCaptured()) { - LDBG("--Convolution strategy capture preconditions failed\n"); + if (!fill->getCaptured()) { + LDBG("--Convolution strategy capture precondition failed\n"); return failure(); } From 208ec512fa69bd04f3566ef125191d379e4705f2 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 13 Jul 2023 11:37:56 -0400 Subject: [PATCH 10/12] [TransformExtensions] Add apply patterns op for fusing pad with consumer --- .../Common/TransformExtensions/CommonExtensions.cpp | 6 ++++++ .../TransformExtensions/CommonExtensionsOps.td | 13 +++++++++++++ 2 files changed, 19 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp index 0ffc77e0f336..2dc5eb0b7aa1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensions.cpp @@ -269,6 +269,12 @@ void transform_dialect::ApplyPrepareVectorToMMAPatternsOp::populatePatterns( populatePrepareVectorToMMAPatterns(patterns, getUseNvGpu()); } +void transform_dialect::ApplySwapTensorPadWithExtractSliceOp::populatePatterns( + RewritePatternSet &patterns) { + patterns.insert( + patterns.getContext(), [](tensor::ExtractSliceOp) { return false; }); +} + //===---------------------------------------------------------------------===// // ApplyCommonSubexpressionEliminationOp //===---------------------------------------------------------------------===// diff --git a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td index 219e7426a3ed..c61cb9398da1 100644 --- a/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td +++ b/compiler/src/iree/compiler/Codegen/Common/TransformExtensions/CommonExtensionsOps.td @@ -519,4 +519,17 @@ def IREEPopulateWorkgroupCountRegionUsingNumThreadsSliceOp : }]; } +def ApplySwapTensorPadWithExtractSliceOp : Op, + ReportTrackingListenerFailuresOpTrait]> { + let description = [{ + Populate patterns to swap tensor pad with consumer tensor.extract_slice + operations. + }]; + + let cppNamespace = "mlir::iree_compiler::IREE::transform_dialect"; + let assemblyFormat = "attr-dict"; +} + #endif // IREE_COMPILER_CODEGEN_COMMON_TRANSFORMEXTENSIONS_COMMONEXTENSIONS From b4830b5a2de7fd10ec54cfe21a87416a81cac124 Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Mon, 10 Jul 2023 09:49:15 -0400 Subject: [PATCH 11/12] [Flow] Enable pad fusion on convolution interface Currently pad fusion only applies to named convolutions. 
This allows it to apply based on the interface. --- .../compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp index b050654a8d8b..7ea303de8cc2 100644 --- a/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp +++ b/compiler/src/iree/compiler/Dialect/Flow/Transforms/FormDispatchRegions.cpp @@ -629,8 +629,11 @@ isFusableWithProducer(OpOperand &operand, return false; } + mlir::linalg::detail::ConvolutionDimensions ignore; if (options.fusePadWithConsumers && isa(producer) && - isa(consumer)) { + linalg::detail::getMatchConvolutionMessage( + linalg::detail::isConvolutionInterfaceImpl(consumer, &ignore)) + .empty()) { return true; } From 52fb67fbbd6668f8af4a4cac17a03df28189cffb Mon Sep 17 00:00:00 2001 From: Quinn Dawkins Date: Thu, 20 Jul 2023 11:51:35 -0400 Subject: [PATCH 12/12] Add pattern for bubbling vector.bitcast through an enclosing scf.if <32 bit width types are handled on the SPIR-V side by introducing bitcasts to and from i32 and bubbling them to the center of the kernel hoping to cancel. This adds a pattern for a bitcast on the result of an scf.if, which comes from the way that padding is handled (transfer_read in the `then` branch, else yield a splat constant). --- .../Common/OptimizeVectorTransferPass.cpp | 79 +++++++++++++++++++ 1 file changed, 79 insertions(+) diff --git a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp index 66acf8cb670a..f26f3223f3ce 100644 --- a/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp +++ b/compiler/src/iree/compiler/Codegen/Common/OptimizeVectorTransferPass.cpp @@ -46,6 +46,84 @@ class TransposeUnitDimToShapeCast } }; +// TODO: Move this upstream +// Hoists a vector.bitcast op to the output of the enclosing scf.if +// +// This transforms IR like: +// %0 = scf.if %1 -> (vector<16xi8>) { +// %2 = memref.load %4[%c0] : memref> +// %3 = vector.bitcast %2 : vector<4xi32> to vector<16xi8> +// scf.yield %3 : vector<16xi8> +// } else { +// scf.yield %cst : vector<16xi8> +// } +// Into: +// %0 = scf.if %1 -> (vector<4xi32>) { +// %2 = memref.load %4[%c0] : memref> +// scf.yield %2 : vector<4xi32> +// } else { +// %3 = vector.bitcast %cst : vector<16xi8> to vector<4xi32> +// scf.yield %0 : vector<4xi32> +// } +// %3 = vector.bitcast %0 : vector<4xi32> to vector<16xi8> +struct BubbleUpBitCastOfScfIf : public OpRewritePattern { + using OpRewritePattern::OpRewritePattern; + + LogicalResult matchAndRewrite(scf::IfOp ifOp, + PatternRewriter &rewriter) const override { + // Bail on more than one result for now. + scf::YieldOp thenYield = ifOp.thenYield(); + if (!thenYield || thenYield.getNumOperands() != 1) + return failure(); + auto bitcastOp = thenYield.getOperand(0).getDefiningOp(); + // Bail out if no bitcast on the if then statement. + if (!bitcastOp) + return failure(); + + VectorType castSrcType = bitcastOp.getSourceVectorType(); + VectorType castDstType = bitcastOp.getResultVectorType(); + assert(castSrcType.getRank() == castDstType.getRank()); + // Skip 0-D vector. 
+ if (castSrcType.getRank() == 0) + return failure(); + + int64_t castSrcLastDim = castSrcType.getShape().back(); + int64_t castDstLastDim = castDstType.getShape().back(); + // Require casting to more elements; + if (castSrcLastDim > castDstLastDim) + return failure(); + + Location loc = ifOp.getLoc(); + + auto bitcastedIfOp = + rewriter.create(loc, castSrcType, ifOp.getCondition()); + bitcastedIfOp.getThenRegion().takeBody(ifOp.getThenRegion()); + bitcastedIfOp.getElseRegion().takeBody(ifOp.getElseRegion()); + + scf::YieldOp newThenYield = bitcastedIfOp.thenYield(); + auto newBitcastOp = + newThenYield.getOperand(0).getDefiningOp(); + + newThenYield.setOperand(0, newBitcastOp.getSource()); + + auto newBitcast = rewriter.create( + loc, castDstType, bitcastedIfOp.getResult(0)); + + scf::YieldOp elseYield = bitcastedIfOp.elseYield(); + if (elseYield) { + OpBuilder::InsertionGuard elseGuard(rewriter); + rewriter.setInsertionPoint(elseYield); + + Value yieldSrc = elseYield.getOperand(0); + auto elseBitcast = + rewriter.create(loc, castSrcType, yieldSrc); + elseYield.setOperand(0, elseBitcast); + } + rewriter.replaceOp(ifOp, newBitcast); + return success(); + } +}; + static void loopInvariantCodeMotion(func::FuncOp funcOp) { // Walk through all loops in a function in innermost-loop-first order. This // way, we first LICM from the inner loop, and place the ops in @@ -89,6 +167,7 @@ struct OptimizeVectorTransferPass { RewritePatternSet patterns(&getContext()); vector::populateBubbleVectorBitCastOpPatterns(patterns); + patterns.add(&getContext()); if (failed(applyPatternsAndFoldGreedily(funcOp, std::move(patterns)))) { return signalPassFailure(); }
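The greedy outer-dimension-first thread distribution that PATCH 09/12 adds to
CopyMapping::numThreadsForCopy can be contrasted with the pre-existing
innermost-first gcd assignment in isolation. The following standalone C++
sketch is illustrative only: it is not IREE code, the function names and the
example sizes are invented for the comparison, and it assumes the scaled copy
sizes and thread budget have already been computed. It loosely mirrors the two
code paths selected by favorLazyOuterDistributing.

#include <cstdint>
#include <iostream>
#include <numeric>
#include <vector>

// Innermost-first assignment: give each dimension gcd(remaining threads, size),
// walking from the fastest-varying dimension outwards (the pre-existing path).
std::vector<int64_t> innermostFirstFactors(int64_t totalThreads,
                                           const std::vector<int64_t> &sizes) {
  std::vector<int64_t> factors(sizes.size(), 1);
  int64_t remaining = totalThreads;
  for (int i = static_cast<int>(sizes.size()) - 1; i >= 0; --i) {
    int64_t g = std::gcd(remaining, sizes[i]);
    factors[i] = g;
    remaining /= g;
  }
  return factors;
}

// Outermost-first assignment: for each dimension, starting from the
// slowest-varying one, take the largest divisor of its size that keeps the
// running thread product within budget (the favorLazyOuterDistributing path).
std::vector<int64_t> outermostFirstFactors(int64_t totalThreads,
                                           const std::vector<int64_t> &sizes) {
  std::vector<int64_t> factors;
  int64_t used = 1;
  for (int64_t s : sizes) {
    int64_t best = 1;
    for (int64_t f = 1; f <= s; ++f) {
      if (f * used > totalThreads)
        break;
      if (s % f == 0)
        best = f;
    }
    factors.push_back(best);
    used *= best;
  }
  return factors;
}

int main() {
  // Hypothetical 6x4x32 pad/copy tile distributed over 64 threads.
  std::vector<int64_t> sizes = {6, 4, 32};
  for (int64_t f : innermostFirstFactors(64, sizes))
    std::cout << f << " "; // prints: 1 2 32
  std::cout << "\n";
  for (int64_t f : outermostFirstFactors(64, sizes))
    std::cout << f << " "; // prints: 6 4 2
  std::cout << "\n";
  return 0;
}

With 64 threads and a 6x4x32 copy, the innermost-first variant concentrates
almost all threads on the fastest-varying dimension (1, 2, 32), while the
outermost-first variant spreads them as (6, 4, 2), which matches the commit's
stated goal of greedily distributing pad copies over the outermost dimensions.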