-
Notifications
You must be signed in to change notification settings - Fork 89
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Fuse Split-Reduce with MLIR #3319
Changes from 179 commits
7e24411
244d8b8
c53c40a
d39f832
d2d3bae
af83509
ac47954
c9407aa
ea41fb9
15c06b5
3931cfc
0370543
d4db0f6
5b37853
ac747b2
2f7e96c
acb291b
c6a7caa
25442a5
3b04922
1cfa65e
61d788c
cad9d3d
52a6a0e
6533429
36af65c
78161de
166f7c9
6647d4b
66e9d31
9107b26
cbf3afc
3947949
ffdba3c
019bb0d
c33f7fd
86df8f1
7e5babf
6f99033
48712c9
c3cf902
28b013e
d40bbae
f9df6fa
51d3ea9
0e60085
fd0b7f7
43ff58b
9c4d659
5eaaed3
1edac2d
4a825cb
1ebdaf1
d035f3b
8c4b8f0
271ea78
43e76f5
b357f94
fbb630e
a3ff01a
7686c3d
5e848ba
c7ff9a7
3964597
efb1f76
f3b2b95
ee50c26
c0c51c5
5a7f247
ae29e39
470984d
593b119
d49cfe3
c12c6bc
55c3c6d
1f76cc5
a238d2a
e26120b
c8b06d5
64642c9
0149594
886fc1b
04e37ad
e533627
2409622
8a008b6
2a75820
96ac474
7a65f2e
a46bbaa
f4b3211
b88f6bd
518fce3
c9f5201
8a44a13
c984b83
ea3fdb7
329955b
d14cd66
1e981a2
2a1c4cd
9a9c2c4
9c50be6
bb76528
51d3c5f
cb909a4
3f4ef63
374e74b
0f785f0
97e4861
32140c9
daa607c
bca36d8
94d9456
072f8dc
0a2a8d8
0a06260
dff3dd4
ff94e04
9540c78
00fef22
207f94e
f022edb
1397e09
244e62e
e4c9eb9
ba53ce4
64ed1ec
cd8762f
7e356a6
6b81657
a1c5ad7
ca11ca4
9a7aa0b
d3ab2af
2e7c2d8
83fd160
a784df3
5d9fe2a
662a29d
d4dd7af
c1cba50
805793d
74496ba
fd5a9a1
bd1eca3
06f54fa
c5032ff
631127a
e82daf1
eb4f262
4589e09
1ac328b
ddbf8ba
68a8afb
7e83db3
ec3dc3f
c5b70b7
ca7df92
9f56e6a
f1550b1
f276db5
2076920
43a22e5
a4d546d
c64d2ee
40325f9
67ea3c6
8ebbb0e
ece936f
c5c4c72
5b51efd
5e828ee
6e78168
69fef78
70063f9
34c539f
1ebf2a3
102a246
f967f7d
335be33
112b14a
57c550e
b02eb78
86b98aa
df96690
8e0acd0
a5733c5
93d24bf
070da3d
dc71b68
1b68e45
4e043c7
848d807
94e112a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change | ||||||
---|---|---|---|---|---|---|---|---|
|
@@ -43,6 +43,7 @@ namespace gpu { | |||||||
|
||||||||
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_EXTRA_MLIR); | ||||||||
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION); | ||||||||
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION); | ||||||||
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_MLIR); | ||||||||
/** | ||||||||
* @brief Declares a new MIGraphX environment variable which forces to generate | ||||||||
|
@@ -386,13 +387,59 @@ bool is_pointwise_op_supported_by_mlir(const instruction& i) | |||||||
return false; | ||||||||
} | ||||||||
|
||||||||
bool is_reduce_op_supported_by_mlir(const instruction& i) | ||||||||
{ | ||||||||
using type_t = shape::type_t; | ||||||||
const auto& name = i.name(); | ||||||||
const auto result_type = i.get_shape().type(); | ||||||||
const std::initializer_list<type_t> allowed_types = { | ||||||||
type_t::float_type, type_t::half_type, type_t::fp8e4m3fnuz_type}; | ||||||||
// Preliminary type check. | ||||||||
if(not contains(allowed_types, result_type)) | ||||||||
{ | ||||||||
return false; | ||||||||
} | ||||||||
const std::initializer_list<std::string> reduce_ops = {"reduce_mean", "reduce_sum"}; | ||||||||
return contains(reduce_ops, i.name()); | ||||||||
} | ||||||||
|
||||||||
// A separate function so we can remove operators that are supported by mlir | ||||||||
// but not supported for an input fusion. | ||||||||
bool is_pointwise_op_supported_by_mlir_for_input(const instruction& i) | ||||||||
{ | ||||||||
return is_pointwise_op_supported_by_mlir(i); | ||||||||
} | ||||||||
|
||||||||
MIGRAPHX_PRED_MATCHER(mlir_split_reduce, instruction_ref ins) | ||||||||
{ | ||||||||
if(ins->name() != "split_fused_reduce") | ||||||||
return false; | ||||||||
auto* mod_arg = ins->module_inputs().front(); | ||||||||
auto supported_reshapes = reshaper_names(); | ||||||||
supported_reshapes.erase("slice"); | ||||||||
std::unordered_set<std::string> builtins = {"@param", "@literal", "@return"}; | ||||||||
for(const auto i : iterator_for(*mod_arg)) | ||||||||
{ | ||||||||
if(is_reduce(*i)) | ||||||||
{ | ||||||||
if(not is_reduce_op_supported_by_mlir(*i)) | ||||||||
return false; | ||||||||
} | ||||||||
else if(i->name() == "pointwise") | ||||||||
{ | ||||||||
if(not std::all_of(i->module_inputs().front()->begin(), | ||||||||
i->module_inputs().front()->end(), | ||||||||
&is_pointwise_op_supported_by_mlir)) | ||||||||
return false; | ||||||||
} | ||||||||
else if(not contains(reshaper_names(), i->name()) and not contains(builtins, i->name())) | ||||||||
{ | ||||||||
return false; | ||||||||
} | ||||||||
} | ||||||||
return true; | ||||||||
} | ||||||||
|
||||||||
MIGRAPHX_PRED_MATCHER(mlir_pointwise, instruction_ref ins) | ||||||||
{ | ||||||||
if(ins->name() != "pointwise") | ||||||||
|
@@ -423,6 +470,100 @@ std::vector<instruction_ref> mlir_contiguous(module_pass_manager& mpm, | |||||||
return result; | ||||||||
} | ||||||||
|
||||||||
struct find_mlir_split_reduce | ||||||||
{ | ||||||||
mlir_mode conv_mode = mlir_mode::none; | ||||||||
mlir_mode dot_mode = mlir_mode::none; | ||||||||
auto matcher() const | ||||||||
{ | ||||||||
auto dot_or_conv = match::name("gpu::mlir_op"); | ||||||||
// TODO: Handle reshapes in between | ||||||||
return mlir_split_reduce()(match::any_of[match::inputs()](dot_or_conv.bind("gemm"))); | ||||||||
} | ||||||||
|
||||||||
void apply(module_pass_manager& mpm, const match::matcher_result& r) const | ||||||||
{ | ||||||||
auto reduce_ins = r.result; | ||||||||
auto gemm_ins = r.instructions["gemm"]; | ||||||||
assert(gemm_ins->get_shape().sub_shapes().empty()); | ||||||||
auto* rm = reduce_ins->module_inputs().front(); | ||||||||
auto names = rm->get_parameter_names(); | ||||||||
std::sort(names.begin(), names.end()); | ||||||||
module_ref gemm_old_mm = gemm_ins->module_inputs().front(); | ||||||||
module_ref mm = | ||||||||
mpm.create_module(gemm_old_mm->name() + "_split_fused_reduce", *gemm_old_mm); | ||||||||
// remove last return instruction | ||||||||
if(std::prev(mm->end())->name() == "@return") | ||||||||
{ | ||||||||
mm->remove_instruction(std::prev(mm->end())); | ||||||||
} | ||||||||
mm->set_bypass(); | ||||||||
std::unordered_map<instruction_ref, instruction_ref> param_map; | ||||||||
param_map[gemm_ins] = std::prev(mm->end()); | ||||||||
bool gemm_has_multi_outs = gemm_ins->outputs().size() > 1; | ||||||||
auto return_vals = | ||||||||
mm->fuse(*rm, | ||||||||
reduce_ins->inputs(), | ||||||||
¶m_map, | ||||||||
[&](module& main_mod, | ||||||||
instruction_ref pos, | ||||||||
const operation& op, | ||||||||
const std::vector<instruction_ref>& inputs, | ||||||||
const std::vector<module_ref>& mod_args) { | ||||||||
if(op.name() == "pointwise") | ||||||||
{ | ||||||||
for(const auto& skip_param : inputs) | ||||||||
{ | ||||||||
if(not contains(param_map, skip_param)) | ||||||||
{ | ||||||||
param_map[skip_param] = | ||||||||
skip_param; // skip adding parameter for inputs of | ||||||||
// pointwise inside split_fused_reduce | ||||||||
} | ||||||||
} | ||||||||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why is this needed? If its not in the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is fusing pointwise module arg inside the fused_mlir_module. Note that pointwise module is submodule to Inputs to pointwise would be available in Line 1047 in ae2b026
But they won't be passed down when "inserter" is invoked. Line 260 in ae2b026
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I see thats because you are reusing the outer param_map. I am not sure that is needed though. You could just use another param_map: auto param_map_2 = create_param_map_with_literals(&main_mod, sub_pm, op.compute_shape(to_shapes(inputs), mod_args));
return main_mod.fuse(*sub_pm, inputs, ¶m_map_2).front(); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. adding params to same There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I also need to add There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
That should never happen. Since the pointwise module is used to generate a single GPU kernel (or function, in the case of fused_reduce), it should never access instructions from the parent scope.
Sure, you can use auto param_map_2 = sub_pm->get_ins_param_map(inputs, true);
auto literal_param_map = create_param_map_with_literals(&main_mod, sub_pm, op.compute_shape(to_shapes(inputs), mod_args));
param_map_2.insert(literal_param_map.begin(), literal_param_map.end());
return main_mod.fuse(*sub_pm, inputs, ¶m_map_2).front(); There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
No, I can't; it will create a map from Line 1004 in ae2b026
I need map from There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Made some changes. |
||||||||
auto* sub_pm = mod_args.front(); | ||||||||
auto param_map_2 = create_param_map_with_literals( | ||||||||
&main_mod, sub_pm, op.compute_shape(to_shapes(inputs), mod_args)); | ||||||||
param_map.insert(param_map_2.begin(), param_map_2.end()); | ||||||||
return main_mod.fuse(*sub_pm, inputs, ¶m_map).front(); | ||||||||
} | ||||||||
return main_mod.insert_instruction(pos, op, inputs, mod_args); | ||||||||
}); | ||||||||
if(gemm_has_multi_outs) | ||||||||
{ | ||||||||
return_vals.insert(return_vals.end(), param_map[gemm_ins]); | ||||||||
} | ||||||||
mm->add_return(return_vals); | ||||||||
std::vector<instruction_ref> inputs; | ||||||||
std::copy_if(reduce_ins->inputs().begin(), | ||||||||
reduce_ins->inputs().end(), | ||||||||
std::back_inserter(inputs), | ||||||||
[&](auto input) { return input != gemm_ins; }); | ||||||||
inputs.insert(inputs.end(), gemm_ins->inputs().begin(), gemm_ins->inputs().end()); | ||||||||
if(gemm_has_multi_outs) | ||||||||
{ | ||||||||
auto fused_ins = mpm.get_module().insert_instruction( | ||||||||
reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); | ||||||||
auto dot_ins = mpm.get_module().insert_instruction( | ||||||||
reduce_ins, | ||||||||
migraphx::make_op("get_tuple_elem", {{"index", return_vals.size() - 1}}), | ||||||||
fused_ins); | ||||||||
|
||||||||
mpm.get_module().replace_instruction(gemm_ins, dot_ins); | ||||||||
for(const auto outs : reduce_ins->outputs()) | ||||||||
{ | ||||||||
assert(outs->get_operator().name() == "get_tuple_elem"); | ||||||||
mpm.get_module().replace_instruction(outs, outs->get_operator(), fused_ins); | ||||||||
} | ||||||||
} | ||||||||
else | ||||||||
{ | ||||||||
mpm.get_module().replace_instruction( | ||||||||
reduce_ins, mlir_op{gemm_ins->get_operator()}, mlir_contiguous(mpm, inputs), {mm}); | ||||||||
} | ||||||||
} | ||||||||
}; | ||||||||
|
||||||||
struct find_mlir_fused_ops | ||||||||
{ | ||||||||
mlir_mode conv_mode = mlir_mode::none; | ||||||||
|
@@ -714,15 +855,25 @@ void fuse_mlir::apply(module_pass_manager& mpm) const | |||||||
mpm, | ||||||||
find_mlir_fused_ops{.conv_mode = get_mode("fused_convolution", mlir_mode::fast), | ||||||||
.dot_mode = get_mode("fused_dot", mlir_mode::fast)}); | ||||||||
|
||||||||
match::find_matches( | ||||||||
mpm, | ||||||||
find_mlir_standalone_convolution_op{get_mode("convolution", mlir_mode::fast)}, | ||||||||
find_mlir_standalone_dot_op{get_mode("dot", mlir_mode::fast)}); | ||||||||
|
||||||||
mpm.run_pass(dead_code_elimination{}); | ||||||||
if(enabled(MIGRAPHX_ENABLE_MLIR_REDUCE_FUSION{})) | ||||||||
{ | ||||||||
match::find_matches( | ||||||||
mpm, | ||||||||
find_mlir_split_reduce{.conv_mode = get_mode("fused_convolution", mlir_mode::fast), | ||||||||
.dot_mode = get_mode("fused_dot", mlir_mode::fast)}); | ||||||||
} | ||||||||
|
||||||||
if(enabled(MIGRAPHX_ENABLE_MLIR_INPUT_FUSION{})) | ||||||||
{ | ||||||||
match::find_matches(mpm, find_pointwise_mlir{}); | ||||||||
} | ||||||||
#else | ||||||||
(void)mpm; | ||||||||
#endif | ||||||||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -50,6 +50,8 @@ struct MIGRAPHX_GPU_EXPORT mlir_code_object | |
std::vector<value> prefill_values = {}; | ||
}; | ||
|
||
MIGRAPHX_GPU_EXPORT bool is_reduce(const instruction& ins); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I dont see this used outside of mlir.cpp. I think it can be removed from the header. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It is being used on both |
||
|
||
MIGRAPHX_GPU_EXPORT mlir_code_object compile_mlir(const context& migraphx_ctx, | ||
module m, | ||
const std::vector<shape>& in_shapes, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,11 +21,16 @@ | |
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
* THE SOFTWARE. | ||
*/ | ||
#include <algorithm> | ||
#include <cstdint> | ||
#include <migraphx/algorithm.hpp> | ||
#include <migraphx/make_op.hpp> | ||
#include <migraphx/stringutils.hpp> | ||
#include <migraphx/dead_code_elimination.hpp> | ||
#include <migraphx/pass_manager.hpp> | ||
#include <migraphx/gpu/mlir.hpp> | ||
#include <mlir-c/Dialect/RockEnums.h> | ||
#include <numeric> | ||
#include <ostream> | ||
|
||
#ifdef MIGRAPHX_MLIR | ||
|
@@ -951,11 +956,60 @@ struct mlir_program | |
std::string sym_name; | ||
}; | ||
|
||
bool is_reduce(const instruction& ins) { return contains(ins.name(), "reduce"); } | ||
|
||
static void rewrite_reduce(module& m) | ||
{ | ||
for(auto i : iterator_for(m)) | ||
{ | ||
if(is_reduce(*i)) | ||
{ | ||
auto reduce_op = i->get_operator().to_value(); | ||
auto reduce_axes = reduce_op["axes"].to_vector<size_t>(); | ||
auto reduce_lens = i->get_shape().lens(); | ||
auto in_shape = i->inputs().front()->get_shape(); | ||
auto in_lens = in_shape.lens(); | ||
assert(in_shape.standard()); | ||
assert(reduce_lens.size() == in_lens.size()); | ||
assert(std::adjacent_find( | ||
reduce_axes.begin(), reduce_axes.end(), [](auto axis_1, auto axis_2) { | ||
return axis_2 - axis_1 > 1; | ||
}) == reduce_axes.end()); | ||
|
||
std::vector<int64_t> new_rsp_dims; | ||
std::vector<int64_t> new_reduce_axes; | ||
for(const auto axis : range(in_shape.ndim())) | ||
{ | ||
if(reduce_lens[axis] == in_lens[axis]) | ||
{ | ||
new_rsp_dims.push_back(in_lens[axis]); | ||
} | ||
else if(new_reduce_axes.empty()) | ||
{ | ||
assert(reduce_lens[axis] == 1); | ||
new_rsp_dims.push_back(-1); | ||
new_reduce_axes.push_back(axis); | ||
} | ||
} | ||
auto rsp_ins = m.insert_instruction( | ||
i, migraphx::make_op("reshape", {{"dims", new_rsp_dims}}), i->inputs().front()); | ||
auto collapsed_reduce = m.insert_instruction( | ||
i, migraphx::make_op("reduce_sum", {{"axes", new_reduce_axes}}), rsp_ins); | ||
auto rsp_back = m.insert_instruction( | ||
i, migraphx::make_op("reshape", {{"dims", reduce_lens}}), collapsed_reduce); | ||
m.replace_instruction(i, rsp_back); | ||
} | ||
} | ||
migraphx::run_passes(m, {migraphx::dead_code_elimination{}}); | ||
} | ||
|
||
bool is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution) | ||
{ | ||
auto mm = m; | ||
rewrite_reduce(mm); | ||
mlir_program mp; | ||
mp.set_gpu_properties(migraphx_ctx); | ||
mp.parse(m); | ||
mp.parse(mm); | ||
mp.run_high_level_pipeline(); | ||
return mlirIsModuleFusible(mp.mmodule.get(), make_mlir_string_ref(*solution.if_string())); | ||
} | ||
|
@@ -988,6 +1042,7 @@ std::string dump_mlir(const module& m, const std::vector<shape>& inputs) | |
mr = &mm; | ||
adjust_param_shapes(mm, inputs); | ||
} | ||
rewrite_reduce(mm); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This wont dump correctly if the inputs are empty. Probably should take the module by value in the function and remove the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
Why is that ? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
It rewrites the There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Done. |
||
mlir_program mp; | ||
mp.parse(*mr); | ||
auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); | ||
|
@@ -1002,6 +1057,7 @@ mlir_code_object compile_mlir(const context& migraphx_ctx, | |
const value& solution) | ||
{ | ||
adjust_param_shapes(m, in_shapes); | ||
rewrite_reduce(m); | ||
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{}); | ||
|
||
static std::mutex mutex; | ||
|
@@ -1081,12 +1137,21 @@ tuning_config get_tuning_config_mlir(const context& migraphx_ctx, | |
bool exhaustive) | ||
{ | ||
adjust_param_shapes(m, inputs); | ||
|
||
rewrite_reduce(m); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should probably make one function(like There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I can do that after this PR. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. |
||
mlir_program mp; | ||
mp.set_gpu_properties(migraphx_ctx); | ||
mp.parse(m); | ||
auto tc = mp.get_tuning_config(exhaustive); | ||
|
||
std::string problem_config = tc.problem.to<std::string>(); | ||
for(const auto i : iterator_for(m)) | ||
{ | ||
if(starts_with(i->name(), "@")) | ||
{ | ||
continue; | ||
} | ||
problem_config += " " + i->name(); | ||
} | ||
tc.problem = problem_config; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I had these changes to experiment and work around problem_cache issue. Reverting these |
||
const bool trace = enabled(MIGRAPHX_TRACE_MLIR{}); | ||
static std::mutex mutex; | ||
if(trace) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why is this flipped?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
There are cases where same input instruction is mapped to multiple parameters.
e.g.
split_fused_reduce(x, y, x)
In those cases, having mapping from
input--> param
would de-duplicate it and only add a single parameter. Later
AMDMIGraphX/src/module.cpp
Line 238 in ae2b026
Here it won't find the parameter in the
map_ins
map, and it would try to add it, which fails.