From 5a0cf97bac046977a4056cd3b22de8bd0b54f0f4 Mon Sep 17 00:00:00 2001
From: Paul Fultz II <pfultz2@yahoo.com>
Date: Tue, 18 Jun 2024 16:59:51 -0500
Subject: [PATCH] Show mlir program when tracing benchmarking (#2741)

---
 docs/dev/env_vars.rst                         |  1 +
 src/targets/gpu/compile_ops.cpp               | 33 ++++++++++---
 .../gpu/include/migraphx/gpu/compiler.hpp     | 46 +++++++++++++++----
 src/targets/gpu/include/migraphx/gpu/mlir.hpp |  1 +
 src/targets/gpu/jit/mlir.cpp                  | 10 +++-
 src/targets/gpu/mlir.cpp                      | 41 ++++++++++++-----
 6 files changed, 104 insertions(+), 28 deletions(-)
diff --git a/docs/dev/env_vars.rst b/docs/dev/env_vars.rst
index 70135ad6836..36c830383a4 100644
--- a/docs/dev/env_vars.rst
+++ b/docs/dev/env_vars.rst
@@ -239,6 +239,7 @@ Defaults to 1.
 
 Set to "1" to print benchmarking trace.
 Set to "2" to print detailed benchmarking trace.
+Set to "3" to also print the program being compiled (for example, the MLIR module) for each benchmarked config.
 
 MLIR vars
 -------------
diff --git a/src/targets/gpu/compile_ops.cpp b/src/targets/gpu/compile_ops.cpp
index 8701aa8426e..cc5a7fc24d7 100644
--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -82,6 +82,12 @@ struct compiled_result
 {
     compiler_replace replace;
     instruction_ref ins;
+
+    friend std::ostream& operator<<(std::ostream& os, const compiled_result& cr)
+    {
+        cr.replace.trace(os, cr.ins);
+        return os;
+    }
 };
 
 struct compile_plan
@@ -153,22 +159,33 @@ struct compile_plan
             insert_compiles(compiles, value{}, 0);
         }
     }
+    std::string problem_string() const
+    {
+        if(config)
+            return to_string(config->problem);
+        return "<no problem key>";
+    }
+
     const compiled_result& benchmark() const
     {
         const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{});
+        if(trace_level > 0 and not results.empty())
+        {
+            std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
+                      << std::endl;
+        }
         if(results.empty())
-            MIGRAPHX_THROW("No configs to tune");
+            MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
+                           problem_string());
         if(results.size() == 1)
         {
             if(not results.front().has_value())
-                MIGRAPHX_THROW("No configs to tune");
+                MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
+                               problem_string());
             return *results.front();
         }
         if(not config)
-            MIGRAPHX_THROW("Multiple kernels without config");
-        if(trace_level > 0)
-            std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs"
-                      << std::endl;
+            MIGRAPHX_THROW("Multiple kernels without config for " + preop.name());
         if(trace_level > 1)
             std::cout << "Problem: " << config->problem << std::endl;
         std::vector<double> times;
@@ -186,6 +203,8 @@ struct compile_plan
                                    std::cout << "No binary" << std::endl;
                                return std::numeric_limits<double>::max();
                            }
+                           if(trace_level > 2)
+                               std::cout << *cr << std::endl;
                            /*
                            create a small program with insturction being compiled and call "replace"
                            on that which would insert all the compiled code objects, prefills etc.
@@ -220,7 +239,7 @@ struct compile_plan
         ctx->get_problem_cache().insert(preop.name(), config->problem, config->solutions.at(i));
         if(not results[i].has_value())
             MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " +
-                           to_string(config->problem));
+                           problem_string());
         auto skipped = std::count_if(
             results.begin(), results.end(), [](const auto& cr) { return not cr.has_value(); });
         if(skipped > 0)
diff --git a/src/targets/gpu/include/migraphx/gpu/compiler.hpp b/src/targets/gpu/include/migraphx/gpu/compiler.hpp
index 03aa79c3388..30f92705133 100644
--- a/src/targets/gpu/include/migraphx/gpu/compiler.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/compiler.hpp
@@ -48,26 +48,48 @@ struct compiler_replace
     compiler_replace(const operation& op) : code_objects{{op}} {}
 
     template <class F>
-    compiler_replace(const operation& op, F f)
-        : code_objects{{op}},
-          replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) {
-              f(m, ins, cr.code_objects.front());
-          })
+    compiler_replace(const operation& op, F f) : code_objects{{op}}, replace_fn(make_replace(f))
+    {
+    }
+
+    template <class F, class Trace>
+    compiler_replace(const operation& op, F f, Trace t)
+        : code_objects{{op}}, replace_fn(make_replace(f)), trace_fn(t)
     {
     }
 
     template <class F>
     compiler_replace(const std::vector<operation>& op, F f)
-        : code_objects{op},
-          replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) {
-              f(m, ins, cr.code_objects);
-          })
+        : code_objects{op}, replace_fn(make_replace_all(f))
+    {
+    }
+
+    template <class F, class Trace>
+    compiler_replace(const std::vector<operation>& op, F f, Trace t)
+        : code_objects{op}, replace_fn(make_replace_all(f)), trace_fn(t)
     {
     }
 
     std::vector<operation> code_objects = {};
     std::function<void(const compiler_replace& cr, module& m, instruction_ref ins)> replace_fn =
         nullptr;
+    std::function<void(std::ostream& os, instruction_ref ins)> trace_fn = nullptr;
+
+    template <class F>
+    static auto make_replace(F f)
+    {
+        return [=](const compiler_replace& cr, module& m, instruction_ref ins) {
+            f(m, ins, cr.code_objects.front());
+        };
+    }
+
+    template <class F>
+    static auto make_replace_all(F f)
+    {
+        return [=](const compiler_replace& cr, module& m, instruction_ref ins) {
+            f(m, ins, cr.code_objects);
+        };
+    }
 
     void replace(module& m, instruction_ref ins) const
     {
@@ -82,6 +104,12 @@ struct compiler_replace
             m.replace_instruction(ins, code_objects.front(), ins->inputs());
         }
     }
+
+    void trace(std::ostream& os, instruction_ref ins) const
+    {
+        if(trace_fn)
+            trace_fn(os, ins);
+    }
 };
 
 using compiler_compile =
diff --git a/src/targets/gpu/include/migraphx/gpu/mlir.hpp b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
index dc395b8eece..8f359fcd38f 100644
--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -38,6 +38,7 @@ struct module;
 namespace gpu {
 
 MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m);
+MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m, const std::vector<shape>& inputs);
 
 MIGRAPHX_GPU_EXPORT bool
 is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution);
diff --git a/src/targets/gpu/jit/mlir.cpp b/src/targets/gpu/jit/mlir.cpp
index 0ec647f6a8b..74181298ab3 100644
--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
@@ -121,7 +121,8 @@ struct mlir_compiler : compiler<mlir_compiler>
                     }
                     auto mlir = insert_mlir(m, ins, any_cast<code_object_op>(ops.front()), inputs);
                     return m.replace_instruction(ins, mlir);
-                }};
+                },
+                &trace};
     }
 
     compiler_replace insert(const std::vector<mlir_code_object>& mcos,
@@ -202,6 +203,13 @@ struct mlir_compiler : compiler<mlir_compiler>
         auto* smod  = ins->module_inputs().front();
         return get_tuning_config_mlir(ctx, *smod, shapes, exhaustive);
     }
+
+    static void trace(std::ostream& os, instruction_ref ins)
+    {
+        auto shapes = to_shapes(ins->inputs());
+        auto* smod  = ins->module_inputs().front();
+        os << dump_mlir(*smod, shapes);
+    }
 };
 
 } // namespace gpu
diff --git a/src/targets/gpu/mlir.cpp b/src/targets/gpu/mlir.cpp
index d12e9b56a2d..0f9f5ce427c 100644
--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
@@ -956,14 +956,6 @@ bool is_module_fusible(const module& m, const context& migraphx_ctx, const value
     return mlirIsModuleFusible(mp.mmodule.get(), make_mlir_string_ref(*solution.if_string()));
 }
 
-std::string dump_mlir(const module& m)
-{
-    mlir_program mp;
-    mp.parse(m);
-    auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
-    return mlir_print(&mlirOperationPrint, mod_op);
-}
-
 void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
 {
     auto names = m.get_parameter_names();
@@ -982,6 +974,24 @@ void adjust_param_shapes(module& m, const std::vector<shape>& inputs)
     }
 }
 
+std::string dump_mlir(const module& m, const std::vector<shape>& inputs)
+{
+    module mm;
+    const_module_ref mr = &m;
+    if(not inputs.empty())
+    {
+        mm = m;
+        mr = &mm;
+        adjust_param_shapes(mm, inputs);
+    }
+    mlir_program mp;
+    mp.parse(*mr);
+    auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
+    return mlir_print(&mlirOperationPrint, mod_op);
+}
+
+std::string dump_mlir(const module& m) { return dump_mlir(m, {}); }
+
 mlir_code_object compile_mlir(const context& migraphx_ctx,
                               module m,
                               const std::vector<shape>& in_shapes,
@@ -1063,27 +1073,36 @@ tuning_config get_tuning_config_mlir(const context& migraphx_ctx,
     mlir_program mp;
     mp.set_gpu_properties(migraphx_ctx);
     mp.parse(m);
+    auto tc = mp.get_tuning_config(exhaustive);
 
     const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
     static std::mutex mutex;
     if(trace)
     {
         const std::lock_guard<std::mutex> lock(mutex);
+        std::cout << "Problem: " << tc.problem << std::endl;
         auto mod_op = mlirModuleGetOperation(mp.mmodule.get());
         std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl;
     }
-    return mp.get_tuning_config(exhaustive);
+    return tc;
 }
 
 #else
 
-std::string dump_mlir(const module&) { return {}; }
-
 template <class T>
 void use(T&)
 {
 }
 
+std::string dump_mlir(const module&) { return {}; }
+
+std::string dump_mlir(const module& m, const std::vector<shape>& inputs)
+{
+    use(m);
+    use(inputs);
+    return {};
+}
+
 // Disabling clang-tidy warning on non-real useage.
 // NOLINTBEGIN(performance-unnecessary-value-param)
 mlir_code_object compile_mlir(const context&, module, const std::vector<shape>&, const value&)