From 968de0878ff38fa481dc621ed989f04af564f9c2 Mon Sep 17 00:00:00 2001
From: Umang Yadav <29876643+umangyadav@users.noreply.github.com>
Date: Thu, 18 Apr 2024 17:47:44 -0400
Subject: [PATCH 1/2] Benchmark candidate programs instead of code objects in
 compile_ops (#2982)

---
 src/include/migraphx/program.hpp              |  2 +
 src/program.cpp                               |  7 +++
 src/targets/gpu/compile_ops.cpp               | 35 +++++++++-----
 .../gpu/include/migraphx/gpu/time_op.hpp      |  7 ++-
 src/targets/gpu/time_op.cpp                   | 46 ++++++++++++++-----
 5 files changed, 73 insertions(+), 24 deletions(-)
diff --git a/src/include/migraphx/program.hpp b/src/include/migraphx/program.hpp
index 741063743e6..5a4884faca4 100644
--- a/src/include/migraphx/program.hpp
+++ b/src/include/migraphx/program.hpp
@@ -80,6 +80,8 @@ struct MIGRAPHX_EXPORT program
     std::vector<argument> eval(parameter_map params,
                                execution_environment exec_env = execution_environment{}) const;
 
+    std::vector<argument> eval_with_context(std::vector<context>& ctx, parameter_map params) const;
+
     void finish() const;
 
     std::size_t size() const;
diff --git a/src/program.cpp b/src/program.cpp
index fb24685fcbc..91935fba0e0 100644
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -523,6 +523,13 @@ std::vector<argument> generic_eval(const program& p,
     return generic_eval(mm, ctx, params, {}, trace);
 }
 
+std::vector<argument> program::eval_with_context(std::vector<context>& ctx,
+                                                 parameter_map params) const
+{
+    auto untraced = [](auto&&, auto run) { return run(); }; // trace hook that only runs the step
+    return generic_eval(this->get_main_module(), ctx, std::move(params), {}, untraced);
+}
+
 std::vector<argument> program::eval(parameter_map params, execution_environment exec_env) const
 {
     auto& contexts = this->impl->contexts;
diff --git a/src/targets/gpu/compile_ops.cpp b/src/targets/gpu/compile_ops.cpp
index 66bfb7e2052..730708c143c 100644
--- a/src/targets/gpu/compile_ops.cpp
+++ b/src/targets/gpu/compile_ops.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <migraphx/program.hpp>
 #include <migraphx/module.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/instruction.hpp>
@@ -185,17 +186,29 @@ struct compile_plan
                                    std::cout << "No binary" << std::endl;
                                return std::numeric_limits<double>::max();
                            }
-                           // Time all the code objects for a given perf config and calculate total
-                           // time e.g. in case of split-K GEMM, it may or may not support fusion.
-                           // In that case MLIR compile would return code objects for individual
-                           // GEMM and pre/post fusion code objects.
-                           auto cobjs = cr->replace.code_objects;
-                           double t   = transform_accumulate(
-                               cobjs.begin(),
-                               cobjs.end(),
-                               double{0},
-                               std::plus<>{},
-                               [&](const operation& op) { return time_op(*ctx, op, 20); });
+                           /*
+                           Create a small program containing the instruction being compiled and
+                           call "replace" on it, which inserts all the compiled code objects,
+                           prefills, etc. necessary to run the candidate code object.
+                           */
+                           program bench_prog;
+                           auto* bench_mm = bench_prog.get_main_module();
+                           std::vector<instruction_ref> bench_ins_inputs;
+
+                           std::transform(cr->ins->inputs().begin(),
+                                          cr->ins->inputs().end(),
+                                          std::back_inserter(bench_ins_inputs),
+                                          [&](const auto& arg) {
+                                              return bench_mm->add_parameter(
+                                                  std::to_string(bench_ins_inputs.size()),
+                                                  arg->get_shape());
+                                          });
+                           auto bench_ins = bench_mm->add_instruction(
+                               cr->ins->get_operator(), bench_ins_inputs, cr->ins->module_inputs());
+                           cr->replace.replace(*bench_mm, bench_ins);
+                           // do dead code elimination by directly removing instruction
+                           bench_mm->remove_instruction(bench_ins);
+                           auto t = time_program(*ctx, bench_prog, 20);
                            if(trace_level > 1)
                                std::cout << t << "ms" << std::endl;
                            return t;
diff --git a/src/targets/gpu/include/migraphx/gpu/time_op.hpp b/src/targets/gpu/include/migraphx/gpu/time_op.hpp
index 69a4767afcf..2c5893eed2f 100644
--- a/src/targets/gpu/include/migraphx/gpu/time_op.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/time_op.hpp
@@ -24,6 +24,7 @@
 #ifndef MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP
 #define MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP
 
+#include <migraphx/program.hpp>
 #include <migraphx/config.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/operation.hpp>
@@ -33,10 +34,12 @@ inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
 MIGRAPHX_GPU_EXPORT double
-time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
+time_op(const context& ictx, operation op, const std::vector<shape>& inputs, int n = 100);
+
+MIGRAPHX_GPU_EXPORT double time_program(const context& ictx, program p, int n = 100);
 
 /* benchmark gpu::code_object with expected input shapes over n iterations */
-MIGRAPHX_GPU_EXPORT double time_op(context& ictx, operation op, int n = 100);
+MIGRAPHX_GPU_EXPORT double time_op(const context& ictx, operation op, int n = 100);
 
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
diff --git a/src/targets/gpu/time_op.cpp b/src/targets/gpu/time_op.cpp
index 51459b64da5..5321bc9d775 100644
--- a/src/targets/gpu/time_op.cpp
+++ b/src/targets/gpu/time_op.cpp
@@ -21,6 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
+#include <migraphx/program.hpp>
 #include <migraphx/gpu/time_op.hpp>
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/context.hpp>
@@ -41,35 +42,58 @@ std::vector<argument> generate_arguments(const std::vector<shape>& shapes, unsig
     return args;
 }
 
-double time_op(context& ictx, operation op, const std::vector<shape>& inputs, int n)
+template <class F> // F: callable invoked as f() with no arguments
+double time_loop(migraphx::gpu::context& gctx, int n, F f) // returns elapsed ms / n
 {
-    // TODO: Use std::ref
-    migraphx::context ctx = ictx;
-    auto& gctx            = any_cast<migraphx::gpu::context>(ctx);
-    auto output           = op.compute_shape(inputs);
-    op.finalize(ctx, output, inputs);
-    auto args = generate_arguments(inputs);
     auto start = context::create_event_for_timing();
     auto stop  = context::create_event_for_timing();
-    auto run   = [&] { op.compute(ctx, output, args); };
-    run();
+    f(); // one call issued before the start event is recorded, so it is not timed
     gctx.get_stream().record(start.get());
     for(auto i : range(n))
     {
         (void)i;
-        run();
+        f(); // the n calls issued between the start and stop events
     }
     gctx.get_stream().record(stop.get());
     gctx.finish();
     return context::get_elapsed_ms(start.get(), stop.get()) / n;
 }
 
-double time_op(context& ictx, operation op, int n)
+double time_op(const context& ictx, operation op, const std::vector<shape>& inputs, int n)
+{
+    // Times op.compute for the given input shapes on a copy of ictx. TODO: Use std::ref
+    migraphx::context local_ctx = ictx;
+    auto& gpu_ctx               = any_cast<migraphx::gpu::context>(local_ctx);
+    auto out_shape              = op.compute_shape(inputs);
+    op.finalize(local_ctx, out_shape, inputs);
+    auto bench_args = generate_arguments(inputs);
+    // Each call handed to time_loop is one compute on the generated arguments.
+    return time_loop(gpu_ctx, n, [&] { op.compute(local_ctx, out_shape, bench_args); });
+}
+
+double time_op(const context& ictx, operation op, int n) // uses op's expected_inputs
 {
     auto inputs = any_cast<migraphx::gpu::code_object_op>(op).expected_inputs;
     return time_op(ictx, op, inputs, n);
 }
 
+double time_program(const context& ictx, program p, int n)
+{
+    // eval_with_context takes a vector of contexts, so wrap a copy of ictx in one.
+    std::vector<migraphx::context> contexts = {ictx};
+    auto& gpu_ctx                           = any_cast<migraphx::gpu::context>(contexts.front());
+    p.get_main_module()->finalize(contexts);
+    // Bind every program parameter to generated data (one seed per parameter) via to_gpu.
+    std::unordered_map<std::string, migraphx::argument> gpu_params;
+    unsigned long next_seed = 0;
+    for(const auto& entry : p.get_parameter_shapes())
+    {
+        gpu_params[entry.first] = to_gpu(generate_argument(entry.second, next_seed++));
+    }
+    auto timed_eval = [&] { p.eval_with_context(contexts, gpu_params); };
+    return time_loop(gpu_ctx, n, timed_eval);
+}
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

From f6e22cb9bd09d2d2cd37b5bc6d03fdfe227890c6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Mirza=20Halil=C4=8Devi=C4=87?=
 <109971222+mirza-halilcevic@users.noreply.github.com>
Date: Fri, 19 Apr 2024 18:23:54 +0200
Subject: [PATCH 2/2] Multibroadcast -> GEMM issue (#2943)

---
 src/targets/gpu/gemm_impl.cpp            |  4 +-
 test/verify/test_gemm_multibroadcast.cpp | 49 ++++++++++++++++++++++++
 2 files changed, 52 insertions(+), 1 deletion(-)
 create mode 100644 test/verify/test_gemm_multibroadcast.cpp

diff --git a/src/targets/gpu/gemm_impl.cpp b/src/targets/gpu/gemm_impl.cpp
index bb0c60fa03b..1dc7ac4280e 100644
--- a/src/targets/gpu/gemm_impl.cpp
+++ b/src/targets/gpu/gemm_impl.cpp
@@ -79,8 +79,10 @@ void blas_shape(const shape& s)
 {
     if(s.lens().size() < 2)
         return;
-    if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; }))
+    if(std::none_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 1; }))
         MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1");
+    if(std::any_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 0; }))
+        MIGRAPHX_THROW("GPU_GEMM: matrix dimensions can't be broadcasted");
     if(s.lens().size() < 3)
         return;
     shape batch_shape{s.type(),
diff --git a/test/verify/test_gemm_multibroadcast.cpp b/test/verify/test_gemm_multibroadcast.cpp
new file mode 100644
index 00000000000..6c8e5f5c50f
--- /dev/null
+++ b/test/verify/test_gemm_multibroadcast.cpp
@@ -0,0 +1,49 @@
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "verify_program.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/make_op.hpp>
+
+template <migraphx::shape::type_t DType>
+struct test_gemm_multibroadcast : verify_program<test_gemm_multibroadcast<DType>>
+{
+    // dot of a {2, 2, 1025} input with a {2, 1, 2} input multibroadcast to {2, 1025, 2}
+    migraphx::program create_program() const
+    {
+        migraphx::program prog;
+        auto* mod  = prog.get_main_module();
+        auto lhs   = mod->add_parameter("a", migraphx::shape{DType, {2, 2, 1025}});
+        auto rhs   = mod->add_parameter("b", migraphx::shape{DType, {2, 1, 2}});
+        auto bcast = migraphx::make_op("multibroadcast", {{"out_lens", {2, 1025, 2}}});
+        mod->add_instruction(migraphx::make_op("dot"), lhs, mod->add_instruction(bcast, rhs));
+        return prog;
+    }
+    std::string section() const { return "gemm"; }
+};
+
+template struct test_gemm_multibroadcast<migraphx::shape::float_type>;
+template struct test_gemm_multibroadcast<migraphx::shape::half_type>;
+template struct test_gemm_multibroadcast<migraphx::shape::fp8e4m3fnuz_type>;