From 968de0878ff38fa481dc621ed989f04af564f9c2 Mon Sep 17 00:00:00 2001 From: Umang Yadav <29876643+umangyadav@users.noreply.github.com> Date: Thu, 18 Apr 2024 17:47:44 -0400 Subject: [PATCH 1/2] Benchmark candidate programs instead of code objects in compile_ops (#2982) --- src/include/migraphx/program.hpp | 2 + src/program.cpp | 7 +++ src/targets/gpu/compile_ops.cpp | 35 +++++++++----- .../gpu/include/migraphx/gpu/time_op.hpp | 7 ++- src/targets/gpu/time_op.cpp | 46 ++++++++++++++----- 5 files changed, 73 insertions(+), 24 deletions(-) diff --git a/src/include/migraphx/program.hpp b/src/include/migraphx/program.hpp index 741063743e6..5a4884faca4 100644 --- a/src/include/migraphx/program.hpp +++ b/src/include/migraphx/program.hpp @@ -80,6 +80,8 @@ struct MIGRAPHX_EXPORT program std::vector eval(parameter_map params, execution_environment exec_env = execution_environment{}) const; + std::vector eval_with_context(std::vector& ctx, parameter_map params) const; + void finish() const; std::size_t size() const; diff --git a/src/program.cpp b/src/program.cpp index fb24685fcbc..91935fba0e0 100644 --- a/src/program.cpp +++ b/src/program.cpp @@ -523,6 +523,13 @@ std::vector generic_eval(const program& p, return generic_eval(mm, ctx, params, {}, trace); } +std::vector program::eval_with_context(std::vector& ctx, + parameter_map params) const +{ + const module* mm = this->get_main_module(); + return generic_eval(mm, ctx, std::move(params), {}, [](auto&&, auto f) { return f(); }); +} + std::vector program::eval(parameter_map params, execution_environment exec_env) const { auto& contexts = this->impl->contexts; diff --git a/src/targets/gpu/compile_ops.cpp b/src/targets/gpu/compile_ops.cpp index 66bfb7e2052..730708c143c 100644 --- a/src/targets/gpu/compile_ops.cpp +++ b/src/targets/gpu/compile_ops.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*/ +#include #include #include #include @@ -185,17 +186,29 @@ struct compile_plan std::cout << "No binary" << std::endl; return std::numeric_limits::max(); } - // Time all the code objects for a given perf config and calculate total - // time e.g. in case of split-K GEMM, it may or may not support fusion. - // In that case MLIR compile would return code objects for individual - // GEMM and pre/post fusion code objects. - auto cobjs = cr->replace.code_objects; - double t = transform_accumulate( - cobjs.begin(), - cobjs.end(), - double{0}, - std::plus<>{}, - [&](const operation& op) { return time_op(*ctx, op, 20); }); + /* + create a small program with instruction being compiled and call "replace" + on that which would insert all the compiled code objects, prefills etc. + necessary to run candidate code object + */ + program bench_prog; + auto* bench_mm = bench_prog.get_main_module(); + std::vector bench_ins_inputs; + + std::transform(cr->ins->inputs().begin(), + cr->ins->inputs().end(), + std::back_inserter(bench_ins_inputs), + [&](const auto& arg) { + return bench_mm->add_parameter( + std::to_string(bench_ins_inputs.size()), + arg->get_shape()); + }); + auto bench_ins = bench_mm->add_instruction( + cr->ins->get_operator(), bench_ins_inputs, cr->ins->module_inputs()); + cr->replace.replace(*bench_mm, bench_ins); + // do dead code elimination by directly removing instruction + bench_mm->remove_instruction(bench_ins); + auto t = time_program(*ctx, bench_prog, 20); if(trace_level > 1) std::cout << t << "ms" << std::endl; return t; diff --git a/src/targets/gpu/include/migraphx/gpu/time_op.hpp b/src/targets/gpu/include/migraphx/gpu/time_op.hpp index 69a4767afcf..2c5893eed2f 100644 --- a/src/targets/gpu/include/migraphx/gpu/time_op.hpp +++ b/src/targets/gpu/include/migraphx/gpu/time_op.hpp @@ -24,6 +24,7 @@ #ifndef MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP #define MIGRAPHX_GUARD_GPU_DRIVER_PERF_HPP +#include #include #include #include @@ -33,10 +34,12 @@ inline namespace 
MIGRAPHX_INLINE_NS { namespace gpu { MIGRAPHX_GPU_EXPORT double -time_op(context& ictx, operation op, const std::vector& inputs, int n = 100); +time_op(const context& ictx, operation op, const std::vector& inputs, int n = 100); + +MIGRAPHX_GPU_EXPORT double time_program(const context& ictx, program p, int n = 100); /* benchmark gpu::code_object with expected input shapes over n iterations */ -MIGRAPHX_GPU_EXPORT double time_op(context& ictx, operation op, int n = 100); +MIGRAPHX_GPU_EXPORT double time_op(const context& ictx, operation op, int n = 100); } // namespace gpu } // namespace MIGRAPHX_INLINE_NS diff --git a/src/targets/gpu/time_op.cpp b/src/targets/gpu/time_op.cpp index 51459b64da5..5321bc9d775 100644 --- a/src/targets/gpu/time_op.cpp +++ b/src/targets/gpu/time_op.cpp @@ -21,6 +21,7 @@ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. */ +#include #include #include #include @@ -41,35 +42,58 @@ std::vector generate_arguments(const std::vector& shapes, unsig return args; } -double time_op(context& ictx, operation op, const std::vector& inputs, int n) +template +double time_loop(migraphx::gpu::context& gctx, int n, F f) { - // TODO: Use std::ref - migraphx::context ctx = ictx; - auto& gctx = any_cast(ctx); - auto output = op.compute_shape(inputs); - op.finalize(ctx, output, inputs); - auto args = generate_arguments(inputs); auto start = context::create_event_for_timing(); auto stop = context::create_event_for_timing(); - auto run = [&] { op.compute(ctx, output, args); }; - run(); + f(); gctx.get_stream().record(start.get()); for(auto i : range(n)) { (void)i; - run(); + f(); } gctx.get_stream().record(stop.get()); gctx.finish(); return context::get_elapsed_ms(start.get(), stop.get()) / n; } -double time_op(context& ictx, operation op, int n) +double time_op(const context& ictx, operation op, const std::vector& inputs, int n) +{ + // TODO: Use std::ref + migraphx::context ctx = ictx; + auto& gctx = any_cast(ctx); + 
auto output = op.compute_shape(inputs); + op.finalize(ctx, output, inputs); + auto args = generate_arguments(inputs); + auto run = [&] { op.compute(ctx, output, args); }; + return time_loop(gctx, n, run); +} + +double time_op(const context& ictx, operation op, int n) { auto inputs = any_cast(op).expected_inputs; return time_op(ictx, op, inputs, n); } +double time_program(const context& ictx, program p, int n) +{ + std::vector ctx_vec = {ictx}; + auto& gctx = any_cast(ctx_vec.front()); + auto* mm = p.get_main_module(); + mm->finalize(ctx_vec); + auto in_shapes = p.get_parameter_shapes(); + std::unordered_map param_map; + unsigned long seed = 0; + for(const auto& [name, shape] : in_shapes) + { + param_map[name] = to_gpu(generate_argument(shape, seed++)); + } + auto run = [&] { p.eval_with_context(ctx_vec, param_map); }; + return time_loop(gctx, n, run); +} + } // namespace gpu } // namespace MIGRAPHX_INLINE_NS } // namespace migraphx From f6e22cb9bd09d2d2cd37b5bc6d03fdfe227890c6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mirza=20Halil=C4=8Devi=C4=87?= <109971222+mirza-halilcevic@users.noreply.github.com> Date: Fri, 19 Apr 2024 18:23:54 +0200 Subject: [PATCH 2/2] Multibroadcast -> GEMM issue (#2943) --- src/targets/gpu/gemm_impl.cpp | 4 +- test/verify/test_gemm_multibroadcast.cpp | 49 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+), 1 deletion(-) create mode 100644 test/verify/test_gemm_multibroadcast.cpp diff --git a/src/targets/gpu/gemm_impl.cpp b/src/targets/gpu/gemm_impl.cpp index bb0c60fa03b..1dc7ac4280e 100644 --- a/src/targets/gpu/gemm_impl.cpp +++ b/src/targets/gpu/gemm_impl.cpp @@ -79,8 +79,10 @@ void blas_shape(const shape& s) { if(s.lens().size() < 2) return; - if(std::none_of(s.strides().end() - 2, s.strides().end(), [&](auto i) { return i == 1; })) + if(std::none_of(s.strides().end() - 2, s.strides().end(), [](auto i) { return i == 1; })) MIGRAPHX_THROW("GPU_GEMM: needs to have one matrix stride as 1"); + if(std::any_of(s.strides().end() - 2, 
s.strides().end(), [](auto i) { return i == 0; })) + MIGRAPHX_THROW("GPU_GEMM: matrix dimensions can't be broadcasted"); if(s.lens().size() < 3) return; shape batch_shape{s.type(), diff --git a/test/verify/test_gemm_multibroadcast.cpp b/test/verify/test_gemm_multibroadcast.cpp new file mode 100644 index 00000000000..6c8e5f5c50f --- /dev/null +++ b/test/verify/test_gemm_multibroadcast.cpp @@ -0,0 +1,49 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2024 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ + +#include "verify_program.hpp" +#include +#include +#include + +template +struct test_gemm_multibroadcast : verify_program> +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + auto a = mm->add_parameter("a", migraphx::shape{DType, {2, 2, 1025}}); + auto b = mm->add_parameter("b", migraphx::shape{DType, {2, 1, 2}}); + auto bb = mm->add_instruction( + migraphx::make_op("multibroadcast", {{"out_lens", {2, 1025, 2}}}), b); + mm->add_instruction(migraphx::make_op("dot"), a, bb); + return p; + } + std::string section() const { return "gemm"; } +}; + +template struct test_gemm_multibroadcast; +template struct test_gemm_multibroadcast; +template struct test_gemm_multibroadcast;