From 5a0cf97bac046977a4056cd3b22de8bd0b54f0f4 Mon Sep 17 00:00:00 2001 From: Paul Fultz II Date: Tue, 18 Jun 2024 16:59:51 -0500 Subject: [PATCH] Show mlir program when tracing benchmarking (#2741) --- docs/dev/env_vars.rst | 1 + src/targets/gpu/compile_ops.cpp | 33 ++++++++++--- .../gpu/include/migraphx/gpu/compiler.hpp | 46 +++++++++++++++---- src/targets/gpu/include/migraphx/gpu/mlir.hpp | 1 + src/targets/gpu/jit/mlir.cpp | 10 +++- src/targets/gpu/mlir.cpp | 41 ++++++++++++----- 6 files changed, 104 insertions(+), 28 deletions(-) diff --git a/docs/dev/env_vars.rst b/docs/dev/env_vars.rst index 70135ad6836..36c830383a4 100644 --- a/docs/dev/env_vars.rst +++ b/docs/dev/env_vars.rst @@ -239,6 +239,7 @@ Defaults to 1. Set to "1" to print benchmarking trace. Set to "2" to print detailed benchmarking trace. +Set to "3" to print compiled traces. MLIR vars ------------- diff --git a/src/targets/gpu/compile_ops.cpp b/src/targets/gpu/compile_ops.cpp index 8701aa8426e..cc5a7fc24d7 100644 --- a/src/targets/gpu/compile_ops.cpp +++ b/src/targets/gpu/compile_ops.cpp @@ -82,6 +82,12 @@ struct compiled_result { compiler_replace replace; instruction_ref ins; + + friend std::ostream& operator<<(std::ostream& os, const compiled_result& cr) + { + cr.replace.trace(os, cr.ins); + return os; + } }; struct compile_plan @@ -153,22 +159,33 @@ struct compile_plan insert_compiles(compiles, value{}, 0); } } + std::string problem_string() const + { + if(config) + return to_string(config->problem); + return ""; + } + const compiled_result& benchmark() const { const auto trace_level = value_of(MIGRAPHX_TRACE_BENCHMARKING{}); + if(trace_level > 0 and not results.empty()) + { + std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs" + << std::endl; + } if(results.empty()) - MIGRAPHX_THROW("No configs to tune"); + MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + + problem_string()); if(results.size() == 1) { if(not results.front().has_value()) - MIGRAPHX_THROW("No configs to tune"); + MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + + problem_string()); return *results.front(); } if(not config) - MIGRAPHX_THROW("Multiple kernels without config"); - if(trace_level > 0) - std::cout << "Benchmarking " << preop.name() << ": " << results.size() << " configs" - << std::endl; + MIGRAPHX_THROW("Multiple kernels without config for " + preop.name()); if(trace_level > 1) std::cout << "Problem: " << config->problem << std::endl; std::vector times; @@ -186,6 +203,8 @@ struct compile_plan std::cout << "No binary" << std::endl; return std::numeric_limits::max(); } + if(trace_level > 2) + std::cout << *cr << std::endl; /* create a small program with insturction being compiled and call "replace" on that which would insert all the compiled code objects, prefills etc. @@ -220,7 +239,7 @@ struct compile_plan ctx->get_problem_cache().insert(preop.name(), config->problem, config->solutions.at(i)); if(not results[i].has_value()) MIGRAPHX_THROW("No valid tuned compilation for " + preop.name() + " with " + - to_string(config->problem)); + problem_string()); auto skipped = std::count_if( results.begin(), results.end(), [](const auto& cr) { return not cr.has_value(); }); if(skipped > 0) diff --git a/src/targets/gpu/include/migraphx/gpu/compiler.hpp b/src/targets/gpu/include/migraphx/gpu/compiler.hpp index 03aa79c3388..30f92705133 100644 --- a/src/targets/gpu/include/migraphx/gpu/compiler.hpp +++ b/src/targets/gpu/include/migraphx/gpu/compiler.hpp @@ -48,26 +48,48 @@ struct compiler_replace compiler_replace(const operation& op) : code_objects{{op}} {} template - compiler_replace(const operation& op, F f) - : code_objects{{op}}, - replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) { - f(m, ins, cr.code_objects.front()); - }) + compiler_replace(const operation& op, F f) : code_objects{{op}}, replace_fn(make_replace(f)) + { + } + + template + compiler_replace(const operation& op, F f, Trace t) + : code_objects{{op}}, replace_fn(make_replace(f)), trace_fn(t) { } template compiler_replace(const std::vector& op, F f) - : code_objects{op}, - replace_fn([=](const compiler_replace& cr, module& m, instruction_ref ins) { - f(m, ins, cr.code_objects); - }) + : code_objects{op}, replace_fn(make_replace_all(f)) + { + } + + template + compiler_replace(const std::vector& op, F f, Trace t) + : code_objects{op}, replace_fn(make_replace_all(f)), trace_fn(t) { } std::vector code_objects = {}; std::function replace_fn = nullptr; + std::function trace_fn = nullptr; + + template + static auto make_replace(F f) + { + return [=](const compiler_replace& cr, module& m, instruction_ref ins) { + f(m, ins, cr.code_objects.front()); + }; + } + + template + static auto make_replace_all(F f) + { + return [=](const compiler_replace& cr, module& m, instruction_ref ins) { + f(m, ins, cr.code_objects); + }; + } void replace(module& m, instruction_ref ins) const { @@ -82,6 +104,12 @@ struct compiler_replace m.replace_instruction(ins, code_objects.front(), ins->inputs()); } } + + void trace(std::ostream& os, instruction_ref ins) const + { + if(trace_fn) + trace_fn(os, ins); + } }; using compiler_compile = diff --git a/src/targets/gpu/include/migraphx/gpu/mlir.hpp b/src/targets/gpu/include/migraphx/gpu/mlir.hpp index dc395b8eece..8f359fcd38f 100644 --- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp +++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp @@ -38,6 +38,7 @@ struct module; namespace gpu { MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m); +MIGRAPHX_GPU_EXPORT std::string dump_mlir(const module& m, const std::vector& inputs); MIGRAPHX_GPU_EXPORT bool is_module_fusible(const module& m, const context& migraphx_ctx, const value& solution); diff --git a/src/targets/gpu/jit/mlir.cpp b/src/targets/gpu/jit/mlir.cpp index 0ec647f6a8b..74181298ab3 100644 --- a/src/targets/gpu/jit/mlir.cpp +++ b/src/targets/gpu/jit/mlir.cpp @@ -121,7 +121,8 @@ struct mlir_compiler : compiler } auto mlir = insert_mlir(m, ins, any_cast(ops.front()), inputs); return m.replace_instruction(ins, mlir); - }}; + }, + &trace}; } compiler_replace insert(const std::vector& mcos, @@ -202,6 +203,13 @@ struct mlir_compiler : compiler auto* smod = ins->module_inputs().front(); return get_tuning_config_mlir(ctx, *smod, shapes, exhaustive); } + + static void trace(std::ostream& os, instruction_ref ins) + { + auto shapes = to_shapes(ins->inputs()); + auto* smod = ins->module_inputs().front(); + os << dump_mlir(*smod, shapes); + } }; } // namespace gpu diff --git a/src/targets/gpu/mlir.cpp b/src/targets/gpu/mlir.cpp index d12e9b56a2d..0f9f5ce427c 100644 --- a/src/targets/gpu/mlir.cpp +++ b/src/targets/gpu/mlir.cpp @@ -956,14 +956,6 @@ bool is_module_fusible(const module& m, const context& migraphx_ctx, const value return mlirIsModuleFusible(mp.mmodule.get(), make_mlir_string_ref(*solution.if_string())); } -std::string dump_mlir(const module& m) -{ - mlir_program mp; - mp.parse(m); - auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); - return mlir_print(&mlirOperationPrint, mod_op); -} - void adjust_param_shapes(module& m, const std::vector& inputs) { auto names = m.get_parameter_names(); @@ -982,6 +974,24 @@ void adjust_param_shapes(module& m, const std::vector& inputs) } } +std::string dump_mlir(const module& m, const std::vector& inputs) +{ + module mm; + const_module_ref mr = &m; + if(not inputs.empty()) + { + mm = m; + mr = &mm; + adjust_param_shapes(mm, inputs); + } + mlir_program mp; + mp.parse(*mr); + auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); + return mlir_print(&mlirOperationPrint, mod_op); +} + +std::string dump_mlir(const module& m) { return dump_mlir(m, {}); } + mlir_code_object compile_mlir(const context& migraphx_ctx, module m, const std::vector& in_shapes, @@ -1063,27 +1073,36 @@ tuning_config get_tuning_config_mlir(const context& migraphx_ctx, mlir_program mp; mp.set_gpu_properties(migraphx_ctx); mp.parse(m); + auto tc = mp.get_tuning_config(exhaustive); const bool trace = enabled(MIGRAPHX_TRACE_MLIR{}); static std::mutex mutex; if(trace) { const std::lock_guard lock(mutex); + std::cout << "Problem: " << tc.problem << std::endl; auto mod_op = mlirModuleGetOperation(mp.mmodule.get()); std::cout << mlir_print(&mlirOperationPrint, mod_op) << std::endl; } - return mp.get_tuning_config(exhaustive); + return tc; } #else -std::string dump_mlir(const module&) { return {}; } - template void use(T&) { } +std::string dump_mlir(const module&) { return {}; } + +std::string dump_mlir(const module& m, const std::vector& inputs) +{ + use(m); + use(inputs); + return {}; +} + // Disabling clang-tidy warning on non-real useage. // NOLINTBEGIN(performance-unnecessary-value-param) mlir_code_object compile_mlir(const context&, module, const std::vector&, const value&)