From f5ebc8f568344c60c2930ffbc2f7c57d459c08a0 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Thu, 27 Jul 2023 14:18:03 -0700 Subject: [PATCH 01/38] Fix inverted logic --- src/targets/gpu/CMakeLists.txt | 2 +- src/targets/gpu/target.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/targets/gpu/CMakeLists.txt b/src/targets/gpu/CMakeLists.txt index 92316c1bb5a..f863fc9c628 100644 --- a/src/targets/gpu/CMakeLists.txt +++ b/src/targets/gpu/CMakeLists.txt @@ -89,7 +89,7 @@ rocm_clang_tidy_check(kernel_file_check) file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp) -if(NOT WIN32) +if(WIN32) # TODO: re-enable when CK is ported to Windows list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp) endif() diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index 37e6c44efbe..082bc5fa949 100644 --- a/src/targets/gpu/target.cpp +++ b/src/targets/gpu/target.cpp @@ -75,7 +75,7 @@ namespace gpu { MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC) -#ifdef _WIN32 +#ifndef _WIN32 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK) #endif @@ -138,7 +138,7 @@ std::vector target::get_passes(migraphx::context& gctx, const compile_opti dead_code_elimination{}, enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}), dead_code_elimination{}, -#ifdef _WIN32 +#ifndef _WIN32 enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}), #endif dead_code_elimination{}, From 8ab0b22ef706d979928c4697321b85dd794c934d Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 29 Aug 2023 17:28:32 -0700 Subject: [PATCH 02/38] Add gemm_softmax_gemm --- requirements.txt | 2 +- src/targets/gpu/fuse_ck.cpp | 91 +++- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 452 ++++++++++++++++++ .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 72 +++ test/onnx/gemm_softmax_gemm_test.onnx | Bin 0 -> 340 bytes test/onnx/gen_onnx.py | 79 +++ test/onnx/old_gemm_softmax_gemm_test.onnx | 46 ++ test/verify/ck_gemm_softmax_gemm.cpp | 59 +++ 8 files changed, 792 insertions(+), 9 deletions(-) create mode 100644 src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp create mode 100644 src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp create mode 100644 test/onnx/gemm_softmax_gemm_test.onnx create mode 100644 test/onnx/old_gemm_softmax_gemm_test.onnx create mode 100644 test/verify/ck_gemm_softmax_gemm.cpp diff --git a/requirements.txt b/requirements.txt index 623fb480ff7..e8994165133 100755 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.6.0 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On -ROCmSoftwarePlatform/composable_kernel@5172ec5280f14974beee2acf1af1db3b2670244c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On +# ROCmSoftwarePlatform/composable_kernel@5172ec5280f14974beee2acf1af1db3b2670244c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index fc3b3e773c8..4d9d574bc2d 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -65,21 +65,62 @@ struct ck_gemm return r; return r.with_type(mods.front()->get_output_shapes().front().type()); } + + static bool is_ck_supported_type(shape::type_t t) + { + return contains({shape::half_type, shape::int8_type, 
shape::int32_type}, t); + } }; MIGRAPHX_REGISTER_OP(ck_gemm); -namespace { -bool is_ck_supported_type(shape::type_t t) +struct ck_gemm_softmax_gemm { - return contains({shape::half_type, shape::int8_type, shape::int32_type}, t); -} + operation op = make_op("dot"); + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op")); + } + + std::string name() const { return "gpu::ck_gemm_softmax_gemm"; } + + void check_gemm_shape(const shape& s) const + { + if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1)) + MIGRAPHX_THROW("Invalid shape for ck_gemm_softmax_gemm"); + } + + shape compute_shape(std::vector inputs, const std::vector& mods) const + { + check_shapes{inputs, *this}.same_ndims(); + if(inputs.size() < 2) + MIGRAPHX_THROW("should have at least two inputs."); + auto a = inputs[0]; + auto b = inputs[1]; + auto b1 = inputs[2]; + for(const auto& input : inputs) + { + check_gemm_shape(input); + } + return op.compute_shape({op.compute_shape({a, b}), b1}); + } + + static bool is_ck_supported_type(shape::type_t t) + { + return contains({shape::half_type}, t); + } +}; +MIGRAPHX_REGISTER_OP(ck_gemm_softmax_gemm); + +namespace { MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) { if(ins->name() != "dot" and ins->name() != "quant_dot") return false; - if(not is_ck_supported_type(ins->get_shape().type())) + if(not ck_gemm::is_ck_supported_type(ins->get_shape().type())) return false; auto a = ins->inputs().front()->get_shape(); auto b = ins->inputs().back()->get_shape(); @@ -99,9 +140,38 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy // to avoid poor-performing GEMM kernels from CK // To-do: Investigate a more precise strategy - return k <= 2048; + return true;//k <= 2048; } +struct find_ck_gemm_softmax_gemm +{ + auto matcher() const + { + auto gemm1 = + match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); + auto mul = match::name("mul")(match::any_of[match::inputs()](gemm1)).bind("scale"); + auto softmax = match::name("softmax")(match::any_of[match::inputs()](mul)).bind("softmax"); + return match::name("dot")(is_ck_gemm().bind("gemm2"))( + match::any_of[match::inputs()](softmax)); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm2_ins = r.instructions["gemm2"]; + auto gemm1_ins = r.instructions["gemm1"]; + + // if (not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) + // return; + + auto inputs = gemm1_ins->inputs(); // A, B + inputs.push_back(gemm2_ins->inputs().back()); // B1 + + mpm.get_module().replace_instruction( + ins, ck_gemm_softmax_gemm{gemm2_ins->get_operator()}, inputs); + } +}; + struct find_ck_gemm_pointwise { // Find a gemm followed by a pointwise operation. 
@@ -127,7 +197,11 @@ struct find_ck_gemm_pointwise ins->get_shape().type() != gemm_ins->get_shape().type()) return; if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) { - return not is_ck_supported_type(input->get_shape().type()); + return not ck_gemm::is_ck_supported_type(input->get_shape().type()); + })) + return; + if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) { + return not input->inputs().empty() and input->inputs().front()->name() == "capture"; })) return; assert(gemm_it != inputs.end()); @@ -152,7 +226,7 @@ struct find_ck_gemm_pointwise struct find_ck_gemm { - auto matcher() const { return match::name("dot")(is_ck_gemm().bind("gemm")); } + auto matcher() const { return match::name("dot", "quant_dot")(is_ck_gemm().bind("gemm")); } void apply(module_pass_manager& mpm, const match::matcher_result& r) const { @@ -165,6 +239,7 @@ struct find_ck_gemm void fuse_ck::apply(module_pass_manager& mpm) const { + match::find_matches(mpm, find_ck_gemm_softmax_gemm{}); match::find_matches(mpm, find_ck_gemm_pointwise{}); match::find_matches(mpm, find_ck_gemm{}); } diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp new file mode 100644 index 00000000000..4598a24aea6 --- /dev/null +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -0,0 +1,452 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "ck/host/device_batched_gemm_softmax_gemm.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +namespace gpu { + +using namespace migraphx::gpu::gen; // NOLINT + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); + +// NOLINTNEXTLINE +static const char* const ck_gemm_softmax_gemm_kernel = R"__migraphx__( +#include +#include +#include +#include +#include <${include}> + +namespace migraphx { + +${preamble} + +extern "C" { + +MIGRAPHX_GLOBAL void ${kernel}(${params}) +{ + transform_args(make_tensors(), rotate_last())(${args})([](auto... 
xs) { + ck_gemm_softmax_gemm<${solution}, ${blocks_per_batch}>(xs...); + }); +} + +} + +} // namespace migraphx + +)__migraphx__"; + +// NOLINTNEXTLINE +static const char* const disable_warning_pragma = R"__migraphx__( +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Weverything" +${content} +#pragma clang diagnostic pop +)__migraphx__"; + +template +static std::string ck_disable_warnings(P p) +{ + return interpolate_string(disable_warning_pragma, + {{"content", std::string{p.first, p.second}}}); +} + +static std::unordered_map create_ck_header_strings() +{ + std::unordered_map result; + auto ck_headers = ck::host::GetHeaders(); + + std::transform( + ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) { + return std::make_pair(p.first, ck_disable_warnings(p.second)); + }); + return result; +} + +static std::vector create_ck_headers() +{ + static const auto& header_strings = create_ck_header_strings(); + std::vector srcs; + std::transform( + header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) { + return src_file{fs::path{p.first}, + {p.second.data(), p.second.data() + p.second.size()}}; + }); + return srcs; +} + +static const std::vector& ck_headers() +{ + static const auto& headers = create_ck_headers(); + return headers; +} + +static bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } + +using tuning_entry = std::pair, size_t>; +static std::vector read_tuning(const std::string& s) +{ + if(not fs::exists(s)) + return {}; + return from_value>(from_json_string(read_string(s))); +} + +static float matrix_distance(const shape& x, const shape& y) +{ + if(x.type() != y.type()) + return std::numeric_limits::max(); + if(transposed_matrix(x) != transposed_matrix(y)) + return std::numeric_limits::max(); + auto sum_squared = std::inner_product(x.lens().rbegin(), + x.lens().rbegin() + 2, + y.lens().rbegin(), + 0, + std::plus<>{}, + [](auto a, auto b) { return (a - b) * (a - b); }); + return std::sqrt(sum_squared); +} + +static std::size_t get_tuning_for(const std::vector& inputs) +{ + static auto tuning = read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, "")); + if(tuning.empty()) + { + std::cout << "*********** Warning: No CK tuning! for config:" << std::endl; + std::cout << " " << inputs[0] << std::endl; + std::cout << " " << inputs[1] << std::endl; + std::cout << " " << inputs[2] << std::endl; + } + auto it = std::find_if( + tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; }); + if(it == tuning.end()) + { + std::cout << "*********** Warning: CK tuning missing for config!" 
<< std::endl; + std::cout << " " << inputs[0] << std::endl; + std::cout << " " << inputs[1] << std::endl; + std::cout << " " << inputs[2] << std::endl; + std::vector> w; + std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) { + if(inputs.size() < 3 or p.first.size() < 3) + MIGRAPHX_THROW("Invalid CK config"); + auto avg_distance = std::inner_product( + p.first.begin(), + p.first.begin() + 3, + inputs.begin(), + 0.0f, + std::plus<>{}, + [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; }); + return std::make_pair(avg_distance, p.second); + }); + std::sort(w.begin(), w.end()); + std::size_t default_value = 4; + if(not w.empty()) + default_value = w.front().second; + auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value); + std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl; + return tuning_val; + } + return it->second; +} + +struct ck_gemm_softmax_gemm_compiler : compiler +{ + static std::string get_layout(const shape& s) + { + return transposed_matrix(s) ? "ck::tensor_layout::gemm::ColumnMajor" + : "ck::tensor_layout::gemm::RowMajor"; + } + + static ck::host::DataType get_type(const shape& s) + { + if(s.type() == shape::half_type) + return ck::host::DataType::Half; + else if(s.type() == shape::float_type) + return ck::host::DataType::Float; + else if(s.type() == shape::int8_type) + return ck::host::DataType::Int8; + else if(s.type() == shape::int32_type) + return ck::host::DataType::Int32; + MIGRAPHX_THROW("Unsupported ck type"); + } + + template + static std::string ck_tuple(Iterator start, Iterator last, F f) + { + std::vector s; + std::transform(start, last, std::back_inserter(s), f); + return "ck::Tuple<" + join_strings(s, ",") + ">"; + } + + static std::vector adjust_inputs(std::vector inputs, bool& swap_inputs) + { + swap_inputs = false; + auto c_shape = inputs.back(); + if(not transposed_matrix(c_shape)) + return inputs; + std::vector perm(c_shape.lens().size()); + std::iota(perm.begin(), perm.end(), 0); + std::swap(perm[perm.size() - 1], perm[perm.size() - 2]); + std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) { + return reorder_shape(s, perm); + }); + swap_inputs = true; + return inputs; + } + + static std::size_t get_batch_count(const shape& s) + { + return std::accumulate( + s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies()); + } + + static void fold_batch_dims(shape& s) + { + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto batch_count = get_batch_count(s); + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + if(transposed_matrix(s)) + s = shape{s.type(), {m1, m2 * batch_count}}; + else + s = shape{s.type(), {m1 * batch_count, m2}}; + } + + static void remove_batch_dims(shape& s) + { + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + s = shape{s.type(), {m1, m2}}; + } + + std::vector names() const { return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"}; } + + static bool standard_batch(const shape& s) + { + if(s.lens().size() < 3) + return true; + std::vector lens(s.lens().begin(), s.lens().end() - 2); + std::vector strides(s.strides().begin(), s.strides().end() - 2); + auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1); + std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) { + return stride / base; + }); + return shape{s.type(), lens, strides}.standard(); + } + + bool 
can_fold_batch(const std::vector& inputs) const + { + const auto& b_shape = inputs[1]; + if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { + return not standard_batch(input); + })) + return false; + const auto& b_strides = b_shape.strides(); + return std::all_of( + b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); + } + + ck::host::device_batched_gemm_softmax_gemm::Problem create_problem(const std::vector& inputs, + const value& v) const + { + const auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; + const auto& b1_shape = inputs[2]; + const auto& c_shape = inputs.back(); + + // cppcheck-suppress unreadVariable + auto rank = a_shape.ndim(); + + auto batch_count = get_batch_count(c_shape); + auto m = c_shape.lens()[rank - 2]; + m = can_fold_batch(inputs) ? m * batch_count : m; + auto n = c_shape.lens().back(); + auto k = a_shape.lens().back(); + auto o = c_shape.lens().back(); + + const bool trans_a = transposed_matrix(a_shape); + const bool trans_b = transposed_matrix(b_shape); + const bool trans_b1 = transposed_matrix(b1_shape); + const bool trans_c = transposed_matrix(c_shape); + const auto a_type = get_type(a_shape); + const auto b_type = get_type(b_shape); + const auto b1_type = get_type(b1_shape); + const auto c_type = get_type(c_shape); + const auto scale = 1.0f; + + std::string ck_passthrough = "ck_passthrough"; + std::string cde_op = ck_passthrough; + + /// update params after adding to jitlib + return ck::host::device_batched_gemm_softmax_gemm::Problem{m, + n, + k, + o, + trans_a, + trans_b, + trans_b1, + trans_c, + a_type, + b_type, + b1_type, + c_type, + ck_passthrough, + ck_passthrough, + ck_passthrough, + ck_passthrough, + scale}; + } + + operation compile_op(context& ctx, const std::vector& inputs, const value& v) const + { + const auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; + const auto& c_shape = inputs.back(); + /// update for 4-arg lookup? + auto tuning_value = v.get("tuning_value", 4); + if(not v.contains("tuning_value")) + tuning_value = get_tuning_for({a_shape, b_shape, c_shape}); + auto batch_count = get_batch_count(c_shape); + auto problem = create_problem(inputs, v); + + const auto include_header = problem.GetIncludeHeader(); + const auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + const auto& solution = solutions.at(tuning_value); + const auto template_str = solution.template_str; + const auto blocks_per_batch = solution.grid_size; + const auto block_size = solution.block_size; + + hip_compile_options options; + options.additional_src_files = ck_headers(); + auto grid_size = can_fold_batch(inputs) ? 
blocks_per_batch : batch_count * blocks_per_batch; + options.set_launch_params(v, grid_size * block_size, block_size); + options.inputs = inputs; + options.output = c_shape; + options.kernel_name = v.get("kernel", "ck_gemm_softmax_gemm_kernel"); + options.virtual_inputs = inputs; + if(can_fold_batch(inputs)) + { + auto vinputs = inputs; + fold_batch_dims(vinputs[0]); + remove_batch_dims(vinputs[1]); + std::for_each(vinputs.begin() + 2, vinputs.end(), fold_batch_dims); + options.virtual_inputs = vinputs; + } + + if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{})) + options.params += " -DMIGRAPHX_CK_CHECK=1"; + + auto src = interpolate_string(ck_gemm_softmax_gemm_kernel, + {{"solution", template_str}, + {"include", include_header}, + {"params", enum_params(inputs.size(), "void * private_p")}, + {"args", enum_params(inputs.size(), "private_p")}, + {"blocks_per_batch", to_string(blocks_per_batch)}, + {"preamble", v.get("preamble", std::string{})}, + {"kernel", options.kernel_name}}); + + return compile_hip_code_object(src, options); + } + + value create_settings(instruction_ref ins, const operation& op) const + { + auto v = op.to_value(); + v["kernel"] = "ck_gemm_softmax_gemm_kernel"; + if(not ins->module_inputs().empty()) + { + auto* pm = ins->module_inputs().front(); + v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_softmax_gemm_function") + + "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm_softmax_gemm, post_ck_gemm_softmax_gemm_function);"; + v["post"] = "ck_function_adaptor"; + v["kernel"] = "ck_gemm_softmax_gemm_" + generate_name_from_ops(*pm) + "_kernel"; + } + return v; + } + + compiler_replace + compile(context& ctx, instruction_ref ins, const operation& op, const value& solution) const + { + auto shapes = to_shapes(ins->inputs()); + auto v = create_settings(ins, op); + if(not solution.is_null()) + v["tuning_value"] = solution; + return {compile_op(ctx, shapes, v), + [=](module& m, instruction_ref ins2, const operation& code_object) { + if(enabled(MIGRAPHX_LOG_CK_GEMM{})) + { + std::vector gemm_shapes{ + shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())}; + std::cout << "gpu::ck_gemm_softmax_gemm: " << to_json_string(to_value(gemm_shapes)) + << std::endl; + } + m.replace_instruction(ins2, code_object, ins2->inputs()); + }}; + } + + optional + get_tuning_config(context& ctx, instruction_ref ins, const operation& op, bool exhaustive) const + { + if(not exhaustive and not enabled(MIGRAPHX_TUNE_CK{})) + return nullopt; + tuning_config tc; + auto shapes = to_shapes(ins->inputs()); + auto problem = create_problem(shapes, create_settings(ins, op)); + auto solutions = problem.GetSolutions(ctx.get_current_device().get_gfx_name()); + tc.solutions.resize(solutions.size()); + std::iota(tc.solutions.begin(), tc.solutions.end(), 0); + std::vector gemm_shapes{shapes[0], shapes[1], shapes.back()}; + tc.problem = to_value(gemm_shapes); + return tc; + } +}; + +} // namespace gpu +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp new file mode 100644 index 00000000000..79b2513a511 --- /dev/null +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -0,0 +1,72 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. 
+ * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP +#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP + +#include +#include +#include +#include +#include +#include + +namespace migraphx { + +// In CK, the B matrix is ordered as N,K instead of K,N +template +constexpr auto ck_transposeb_dims(Dims dims) +{ + return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); }); +} + +template +using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), + ck_transposeb_dims(get_shape_c{}.strides))); + +template +__device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1) +{ + constexpr auto desc = G::make_descriptor(to_ck_tensor(), + to_ck_tensor>(), + to_ck_tensor>(), + to_ck_tensor()); + + static_assert(desc.IsValid(), "Invalid ck gemm."); + + G::Run(desc, + to_ck_const_pointer(a.data()), + to_ck_const_pointer(b.data()), + to_ck_const_pointer(b1.data()), + to_ck_pointer(c.data())); +} + +template +__device__ void ck_gemm_softmax_gemm(Ts... xs) +{ + gemm_batch_args(make_index(), _c, xs...)( + [](auto... 
ys) { ck_gemm_softmax_gemm_matrix(ys...); }); +} + +} // namespace migraphx +#endif diff --git a/test/onnx/gemm_softmax_gemm_test.onnx b/test/onnx/gemm_softmax_gemm_test.onnx new file mode 100644 index 0000000000000000000000000000000000000000..7c290563a2d54eca272913cdc2209f665aa28493 GIT binary patch literal 340 zcmZ{fyAFat5Jla^$9OeL)<B}odf)e@_VXA!d?9+-MaGe!B#%qWGRg)u zKv&&mCP;?iM>vKdwy1#^RRQM6;8fONjV1?IuCN*v|Jog!7%>|TV_w2 e{hOFzOb`T4%#4BR2hNMTEmbXGO4_o=?db~>Qcvvw literal 0 HcmV?d00001 diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py index 2fb2b4082e3..490bc921bec 100644 --- a/test/onnx/gen_onnx.py +++ b/test/onnx/gen_onnx.py @@ -7799,3 +7799,82 @@ def where_mixed_test(): outputs=['z']) return ([node], [c, x, y], [z]) + + +@onnx_test() +def gemm_softmax_gemm_test(): + a = helper.make_tensor_value_info('a', TensorProto.FLOAT16, [1, 1]) + b = helper.make_tensor_value_info('b', TensorProto.FLOAT16, [1, 1]) + # c = helper.make_tensor_value_info('c', TensorProto.FLOAT16, [1, 1]) + b1 = helper.make_tensor_value_info('b1', TensorProto.FLOAT16, [1, 1]) + # bias = helper.make_tensor_value_info('bias', TensorProto.FLOAT16, [1, 1]) + out = helper.make_tensor_value_info('out', TensorProto.FLOAT16, [1, 1]) + + scale_array = np.array([1]) + bias_array = np.array([0]) + + scale_tensor = helper.make_tensor(name='scale', + data_type=TensorProto.FLOAT16, + dims=[1, 1], + vals=[1]) + bias_tensor = helper.make_tensor(name='bias', + data_type=TensorProto.FLOAT16, + dims=[1, 1], + vals=[0]) + + gemm1 = onnx.helper.make_node('MatMul', + inputs=['a', 'b'], + outputs=['gemm1_out']) + mul1 = onnx.helper.make_node('Mul', + inputs=['gemm1_out', 'scale'], + outputs=['mul1_out']) + add1 = onnx.helper.make_node('Add', + inputs=['mul1_out', 'bias'], + outputs=['add1_out']) + softmax = onnx.helper.make_node('Softmax', + inputs=['add1_out'], + outputs=['softmax_out']) + gemm2 = onnx.helper.make_node('MatMul', + inputs=['softmax_out', 'b1'], + outputs=['out']) + + + return ([gemm1, mul1, add1, softmax, gemm2], [a, b, b1], [out], [scale_tensor, bias_tensor]) + + + +@onnx_test() +def old_gemm_softmax_gemm_test(): + a = helper.make_tensor_value_info('a', TensorProto.FLOAT16, [1, 1]) + b = helper.make_tensor_value_info('b', TensorProto.FLOAT16, [1, 1]) + c = helper.make_tensor_value_info('c', TensorProto.FLOAT16, [1, 1]) + b1 = helper.make_tensor_value_info('b1', TensorProto.FLOAT16, [1, 1]) + bias = helper.make_tensor_value_info('bias', TensorProto.FLOAT16, [1, 1]) + out = helper.make_tensor_value_info('out', TensorProto.FLOAT16, [1, 1]) + + scale_array = np.array([(1/8)]) + + scale_tensor = helper.make_tensor('scale', + TensorProto.FLOAT16, + [1, 1], + [1]) + + gemm1 = onnx.helper.make_node('MatMul', + inputs=['a', 'b'], + outputs=['gemm1_out']) + mul1 = onnx.helper.make_node('Mul', + inputs=['gemm1_out', 'scale'], + outputs=['mul1_out']) + add1 = onnx.helper.make_node('Add', + inputs=['mul1_out', 'c'], + outputs=['add1_out']) + softmax = onnx.helper.make_node('Softmax', + inputs=['add1_out'], + outputs=['softmax_out']) + gemm2 = onnx.helper.make_node('MatMul', + inputs=['softmax_out', 'b1'], + outputs=['out']) + + + return ([gemm1, mul1, add1, softmax, gemm2], [a, b, c, b1, bias], [out], [scale_tensor]) + diff --git a/test/onnx/old_gemm_softmax_gemm_test.onnx b/test/onnx/old_gemm_softmax_gemm_test.onnx new file mode 100644 index 00000000000..91032e3b6ea --- /dev/null +++ b/test/onnx/old_gemm_softmax_gemm_test.onnx @@ -0,0 +1,46 @@ +old_gemm_softmax_gemm_test:Ð + +a +b gemm1_out"MatMul +! 
+ gemm1_out +scalemul1_out"Mul + +mul1_out +cadd1_out"Add + +add1_out softmax_out"Softmax + + softmax_out +b1out"MatMulold_gemm_softmax_gemm_test* +*BscaleZ +a +  + + +Z +b +  + + +Z +c +  + + +Z +b1 +  + + +Z +bias +  + + +b +out +  + + +B \ No newline at end of file diff --git a/test/verify/ck_gemm_softmax_gemm.cpp b/test/verify/ck_gemm_softmax_gemm.cpp new file mode 100644 index 00000000000..516a3692c39 --- /dev/null +++ b/test/verify/ck_gemm_softmax_gemm.cpp @@ -0,0 +1,59 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "verify_program.hpp" +#include +#include +#include + +struct ck_gemm_softmax_gemm : verify_program +{ + migraphx::program create_program() const + { + migraphx::program p; + auto* mm = p.get_main_module(); + migraphx::shape m1_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; + migraphx::shape m2_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; + auto m2_elements = 1 * 12 * 256 * 256; + auto a = mm->add_parameter("1", m1_shape); + auto b = mm->add_parameter("2", m1_shape); + auto b1 = mm->add_parameter("3", m1_shape); + auto c = mm->add_parameter("4", m1_shape); + std::vector eights(m2_elements, 0.125); + auto eight = mm->add_literal(migraphx::literal{m2_shape, eights}); + std::vector zeros(m2_elements, 0); + auto zero = mm->add_literal(migraphx::literal{m2_shape, zeros}); + std::vector ones(m2_elements, 1); + auto one = mm->add_literal(migraphx::literal{m2_shape, ones}); + + b = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), b); + auto gemm1 = mm->add_instruction(migraphx::make_op("dot"), a, b); + auto scale = mm->add_instruction(migraphx::make_op("mul"), gemm1, eight); + auto bias = mm->add_instruction(migraphx::make_op("add"), scale, zero); + auto softmax = mm->add_instruction(migraphx::make_op("softmax", {{"axis", -1}}), bias); + mm->add_instruction(migraphx::make_op("dot"), softmax, b1); + + return p; + } +}; From 0a463c1e492dba4a7e1542aeb080589af902b060 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 29 Aug 2023 17:28:55 -0700 Subject: [PATCH 03/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 10 +-- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 72 +++++++++++--------- test/verify/ck_gemm_softmax_gemm.cpp | 14 ++-- 3 files changed, 48 insertions(+), 48 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 
4d9d574bc2d..da33d2b4193 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -73,7 +73,6 @@ struct ck_gemm }; MIGRAPHX_REGISTER_OP(ck_gemm); - struct ck_gemm_softmax_gemm { operation op = make_op("dot"); @@ -107,10 +106,7 @@ struct ck_gemm_softmax_gemm return op.compute_shape({op.compute_shape({a, b}), b1}); } - static bool is_ck_supported_type(shape::type_t t) - { - return contains({shape::half_type}, t); - } + static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); } }; MIGRAPHX_REGISTER_OP(ck_gemm_softmax_gemm); @@ -140,7 +136,7 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy // to avoid poor-performing GEMM kernels from CK // To-do: Investigate a more precise strategy - return true;//k <= 2048; + return true; // k <= 2048; } struct find_ck_gemm_softmax_gemm @@ -163,7 +159,7 @@ struct find_ck_gemm_softmax_gemm // if (not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) // return; - + auto inputs = gemm1_ins->inputs(); // A, B inputs.push_back(gemm2_ins->inputs().back()); // B1 diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index 4598a24aea6..0a14de626f8 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -266,7 +266,10 @@ struct ck_gemm_softmax_gemm_compiler : compiler s = shape{s.type(), {m1, m2}}; } - std::vector names() const { return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"}; } + std::vector names() const + { + return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"}; + } static bool standard_batch(const shape& s) { @@ -293,13 +296,13 @@ struct ck_gemm_softmax_gemm_compiler : compiler b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); } - ck::host::device_batched_gemm_softmax_gemm::Problem create_problem(const std::vector& inputs, - const value& v) const + ck::host::device_batched_gemm_softmax_gemm::Problem + create_problem(const std::vector& inputs, const value& v) const { - const auto& a_shape = inputs[0]; - const auto& b_shape = inputs[1]; + const auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; const auto& b1_shape = inputs[2]; - const auto& c_shape = inputs.back(); + const auto& c_shape = inputs.back(); // cppcheck-suppress unreadVariable auto rank = a_shape.ndim(); @@ -311,37 +314,37 @@ struct ck_gemm_softmax_gemm_compiler : compiler auto k = a_shape.lens().back(); auto o = c_shape.lens().back(); - const bool trans_a = transposed_matrix(a_shape); - const bool trans_b = transposed_matrix(b_shape); + const bool trans_a = transposed_matrix(a_shape); + const bool trans_b = transposed_matrix(b_shape); const bool trans_b1 = transposed_matrix(b1_shape); - const bool trans_c = transposed_matrix(c_shape); - const auto a_type = get_type(a_shape); - const auto b_type = get_type(b_shape); + const bool trans_c = transposed_matrix(c_shape); + const auto a_type = get_type(a_shape); + const auto b_type = get_type(b_shape); const auto b1_type = get_type(b1_shape); - const auto c_type = get_type(c_shape); - const auto scale = 1.0f; + const auto c_type = get_type(c_shape); + const auto scale = 1.0f; std::string ck_passthrough = "ck_passthrough"; std::string cde_op = ck_passthrough; - + /// update params after adding to jitlib return ck::host::device_batched_gemm_softmax_gemm::Problem{m, - n, - k, - o, - trans_a, - trans_b, - trans_b1, - 
trans_c, - a_type, - b_type, - b1_type, - c_type, - ck_passthrough, - ck_passthrough, - ck_passthrough, - ck_passthrough, - scale}; + n, + k, + o, + trans_a, + trans_b, + trans_b1, + trans_c, + a_type, + b_type, + b1_type, + c_type, + ck_passthrough, + ck_passthrough, + ck_passthrough, + ck_passthrough, + scale}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const @@ -350,7 +353,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler const auto& b_shape = inputs[1]; const auto& c_shape = inputs.back(); /// update for 4-arg lookup? - auto tuning_value = v.get("tuning_value", 4); + auto tuning_value = v.get("tuning_value", 4); if(not v.contains("tuning_value")) tuning_value = get_tuning_for({a_shape, b_shape, c_shape}); auto batch_count = get_batch_count(c_shape); @@ -403,7 +406,8 @@ struct ck_gemm_softmax_gemm_compiler : compiler { auto* pm = ins->module_inputs().front(); v["preamble"] = generate_pointwise(*pm, "post_ck_gemm_softmax_gemm_function") + - "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm_softmax_gemm, post_ck_gemm_softmax_gemm_function);"; + "\nMIGRAPHX_LIFT_CLASS(post_ck_gemm_softmax_gemm, " + "post_ck_gemm_softmax_gemm_function);"; v["post"] = "ck_function_adaptor"; v["kernel"] = "ck_gemm_softmax_gemm_" + generate_name_from_ops(*pm) + "_kernel"; } @@ -423,8 +427,8 @@ struct ck_gemm_softmax_gemm_compiler : compiler { std::vector gemm_shapes{ shapes[0], shapes[1], shapes.back().with_type(shapes[0].type())}; - std::cout << "gpu::ck_gemm_softmax_gemm: " << to_json_string(to_value(gemm_shapes)) - << std::endl; + std::cout << "gpu::ck_gemm_softmax_gemm: " + << to_json_string(to_value(gemm_shapes)) << std::endl; } m.replace_instruction(ins2, code_object, ins2->inputs()); }}; diff --git a/test/verify/ck_gemm_softmax_gemm.cpp b/test/verify/ck_gemm_softmax_gemm.cpp index 516a3692c39..1759e36f4bd 100644 --- a/test/verify/ck_gemm_softmax_gemm.cpp +++ b/test/verify/ck_gemm_softmax_gemm.cpp @@ -36,10 +36,10 @@ struct ck_gemm_softmax_gemm : verify_program migraphx::shape m1_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; migraphx::shape m2_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; auto m2_elements = 1 * 12 * 256 * 256; - auto a = mm->add_parameter("1", m1_shape); - auto b = mm->add_parameter("2", m1_shape); - auto b1 = mm->add_parameter("3", m1_shape); - auto c = mm->add_parameter("4", m1_shape); + auto a = mm->add_parameter("1", m1_shape); + auto b = mm->add_parameter("2", m1_shape); + auto b1 = mm->add_parameter("3", m1_shape); + auto c = mm->add_parameter("4", m1_shape); std::vector eights(m2_elements, 0.125); auto eight = mm->add_literal(migraphx::literal{m2_shape, eights}); std::vector zeros(m2_elements, 0); @@ -48,9 +48,9 @@ struct ck_gemm_softmax_gemm : verify_program auto one = mm->add_literal(migraphx::literal{m2_shape, ones}); b = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), b); - auto gemm1 = mm->add_instruction(migraphx::make_op("dot"), a, b); - auto scale = mm->add_instruction(migraphx::make_op("mul"), gemm1, eight); - auto bias = mm->add_instruction(migraphx::make_op("add"), scale, zero); + auto gemm1 = mm->add_instruction(migraphx::make_op("dot"), a, b); + auto scale = mm->add_instruction(migraphx::make_op("mul"), gemm1, eight); + auto bias = mm->add_instruction(migraphx::make_op("add"), scale, zero); auto softmax = mm->add_instruction(migraphx::make_op("softmax", {{"axis", -1}}), bias); mm->add_instruction(migraphx::make_op("dot"), softmax, b1); From 3793980505236af9129c2215cd3c7c7693368691 Mon Sep 
17 00:00:00 2001 From: Alan Turner Date: Thu, 21 Sep 2023 23:12:49 +0000 Subject: [PATCH 04/38] Move fuse_gsg to fuse_ck and fix bugs --- src/targets/gpu/fuse_ck.cpp | 104 ++++++++++++------ src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 27 +++-- .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 25 ++++- 3 files changed, 107 insertions(+), 49 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index da33d2b4193..b4456970424 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -76,11 +76,12 @@ MIGRAPHX_REGISTER_OP(ck_gemm); struct ck_gemm_softmax_gemm { operation op = make_op("dot"); + double scale = 1.0; template static auto reflect(Self& self, F f) { - return pack(f(self.op, "op")); + return pack(f(self.op, "op"), f(self.scale, "scale")); } std::string name() const { return "gpu::ck_gemm_softmax_gemm"; } @@ -91,7 +92,7 @@ struct ck_gemm_softmax_gemm MIGRAPHX_THROW("Invalid shape for ck_gemm_softmax_gemm"); } - shape compute_shape(std::vector inputs, const std::vector& mods) const + shape compute_shape(std::vector inputs, const std::vector&) const { check_shapes{inputs, *this}.same_ndims(); if(inputs.size() < 2) @@ -136,38 +137,9 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy // to avoid poor-performing GEMM kernels from CK // To-do: Investigate a more precise strategy - return true; // k <= 2048; + return k <= 2048; } -struct find_ck_gemm_softmax_gemm -{ - auto matcher() const - { - auto gemm1 = - match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("mul")(match::any_of[match::inputs()](gemm1)).bind("scale"); - auto softmax = match::name("softmax")(match::any_of[match::inputs()](mul)).bind("softmax"); - return match::name("dot")(is_ck_gemm().bind("gemm2"))( - match::any_of[match::inputs()](softmax)); - } - - void apply(module_pass_manager& mpm, const match::matcher_result& r) const - { - auto ins = r.result; - auto gemm2_ins = r.instructions["gemm2"]; - auto gemm1_ins = r.instructions["gemm1"]; - - // if (not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) - // return; - - auto inputs = gemm1_ins->inputs(); // A, B - inputs.push_back(gemm2_ins->inputs().back()); // B1 - - mpm.get_module().replace_instruction( - ins, ck_gemm_softmax_gemm{gemm2_ins->get_operator()}, inputs); - } -}; - struct find_ck_gemm_pointwise { // Find a gemm followed by a pointwise operation. 
@@ -231,6 +203,74 @@ struct find_ck_gemm } }; +static bool is_mul_module(const module& m) +{ + std::vector result; + for(auto& ins : m) + { + if(starts_with(ins.name(), "@")) + continue; + if(contains({"multibroadcast", "contiguous"}, ins.name())) + continue; + if(ins.name() == "pointwise") + { + return is_mul_module(*ins.module_inputs().front()); + } + else if(ins.name() == "mul") + { + return true; + } + } + return false; +} + +struct find_ck_gemm_softmax_gemm +{ + auto matcher() const + { + auto gemm1 = + match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); + auto mul = match::name("pointwise")(match::any_of[match::inputs()](gemm1)).bind("scale"); + auto softmax = match::name("softmax")(match::any_of[match::inputs()](mul)).bind("softmax"); + return match::name("dot")(is_ck_gemm().bind("gemm2"))( + match::any_of[match::inputs()](softmax)); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm2_ins = r.instructions["gemm2"]; + auto gemm1_ins = r.instructions["gemm1"]; + auto scale_ins = r.instructions["scale"]; + + if (scale_ins->module_inputs().size() != 1 or not is_mul_module(*scale_ins->module_inputs().front())) + return; + if (not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) + return; + + double scale = 1.0; + for (auto& in: scale_ins->inputs()) + { + if (in->can_eval()) + { + in->get_literal().visit([&](const auto s) { + if (std::all_of( + s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) + scale = s.front(); + else + return; + }); + } + } + + auto inputs = gemm1_ins->inputs(); // A, B + inputs.push_back(gemm2_ins->inputs().back()); // B1 + + mpm.get_module().replace_instruction( + ins, ck_gemm_softmax_gemm{gemm2_ins->get_operator(), scale}, inputs); + } +}; + } // namespace void fuse_ck::apply(module_pass_manager& mpm) const diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index 0a14de626f8..53afb42c20c 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -58,6 +58,8 @@ static const char* const ck_gemm_softmax_gemm_kernel = R"__migraphx__( #include #include #include +#include +#include #include <${include}> namespace migraphx { @@ -69,7 +71,8 @@ extern "C" { MIGRAPHX_GLOBAL void ${kernel}(${params}) { transform_args(make_tensors(), rotate_last())(${args})([](auto... 
xs) { - ck_gemm_softmax_gemm<${solution}, ${blocks_per_batch}>(xs...); + auto settings = make_ck_gemm_softmax_gemm_settings(MIGRAPHX_MAKE_CONSTANT(float{SCALE})); + ck_gemm_softmax_gemm<${solution}, ${blocks_per_batch}>(settings, xs...); }); } @@ -158,6 +161,7 @@ static std::size_t get_tuning_for(const std::vector& inputs) std::cout << " " << inputs[0] << std::endl; std::cout << " " << inputs[1] << std::endl; std::cout << " " << inputs[2] << std::endl; + std::cout << " " << inputs[3] << std::endl; } auto it = std::find_if( tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; }); @@ -167,6 +171,7 @@ static std::size_t get_tuning_for(const std::vector& inputs) std::cout << " " << inputs[0] << std::endl; std::cout << " " << inputs[1] << std::endl; std::cout << " " << inputs[2] << std::endl; + std::cout << " " << inputs[3] << std::endl; std::vector> w; std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) { if(inputs.size() < 3 or p.first.size() < 3) @@ -181,7 +186,7 @@ static std::size_t get_tuning_for(const std::vector& inputs) return std::make_pair(avg_distance, p.second); }); std::sort(w.begin(), w.end()); - std::size_t default_value = 4; + std::size_t default_value = 5; if(not w.empty()) default_value = w.front().second; auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value); @@ -322,12 +327,8 @@ struct ck_gemm_softmax_gemm_compiler : compiler const auto b_type = get_type(b_shape); const auto b1_type = get_type(b1_shape); const auto c_type = get_type(c_shape); - const auto scale = 1.0f; std::string ck_passthrough = "ck_passthrough"; - std::string cde_op = ck_passthrough; - - /// update params after adding to jitlib return ck::host::device_batched_gemm_softmax_gemm::Problem{m, n, k, @@ -343,19 +344,18 @@ struct ck_gemm_softmax_gemm_compiler : compiler ck_passthrough, ck_passthrough, ck_passthrough, - ck_passthrough, - scale}; + ck_passthrough}; } operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { const auto& a_shape = inputs[0]; const auto& b_shape = inputs[1]; + const auto& b1_shape = inputs[2]; const auto& c_shape = inputs.back(); - /// update for 4-arg lookup? 
auto tuning_value = v.get("tuning_value", 4); if(not v.contains("tuning_value")) - tuning_value = get_tuning_for({a_shape, b_shape, c_shape}); + tuning_value = get_tuning_for({a_shape, b_shape, b1_shape, c_shape}); auto batch_count = get_batch_count(c_shape); auto problem = create_problem(inputs, v); @@ -386,6 +386,11 @@ struct ck_gemm_softmax_gemm_compiler : compiler if(v.get("check", false) or enabled(MIGRAPHX_CK_DEBUG{})) options.params += " -DMIGRAPHX_CK_CHECK=1"; + // scale + assert(v.contains("scale")); + auto scale = v.at("scale").to(); + options.params += " -DSCALE=" + std::to_string(scale); + auto src = interpolate_string(ck_gemm_softmax_gemm_kernel, {{"solution", template_str}, {"include", include_header}, @@ -394,7 +399,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler {"blocks_per_batch", to_string(blocks_per_batch)}, {"preamble", v.get("preamble", std::string{})}, {"kernel", options.kernel_name}}); - + return compile_hip_code_object(src, options); } diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp index 79b2513a511..abe62473702 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -44,8 +44,20 @@ template using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), ck_transposeb_dims(get_shape_c{}.strides))); -template -__device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1) +template +struct ck_gemm_softmax_gemm_settings +{ + T scale{}; +}; + +template +constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... xs) +{ + return {xs...}; +} + +template +__device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) { constexpr auto desc = G::make_descriptor(to_ck_tensor(), to_ck_tensor>(), @@ -53,19 +65,20 @@ __device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1) to_ck_tensor()); static_assert(desc.IsValid(), "Invalid ck gemm."); - + const float scale = s.scale; G::Run(desc, + scale, to_ck_const_pointer(a.data()), to_ck_const_pointer(b.data()), to_ck_const_pointer(b1.data()), to_ck_pointer(c.data())); } -template -__device__ void ck_gemm_softmax_gemm(Ts... xs) +template +__device__ void ck_gemm_softmax_gemm(Settings s, Ts... xs) { gemm_batch_args(make_index(), _c, xs...)( - [](auto... ys) { ck_gemm_softmax_gemm_matrix(ys...); }); + [&](auto... 
ys) { ck_gemm_softmax_gemm_matrix(ys..., s); }); } } // namespace migraphx From c393f233ea359c722c61dde9b8a3bfade5798765 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Thu, 21 Sep 2023 23:13:00 +0000 Subject: [PATCH 05/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 22 +++++++++++--------- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 4 ++-- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index b4456970424..841e95eb2ad 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -218,7 +218,7 @@ static bool is_mul_module(const module& m) } else if(ins.name() == "mul") { - return true; + return true; } } return false; @@ -230,7 +230,7 @@ struct find_ck_gemm_softmax_gemm { auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")(match::any_of[match::inputs()](gemm1)).bind("scale"); + auto mul = match::name("pointwise")(match::any_of[match::inputs()](gemm1)).bind("scale"); auto softmax = match::name("softmax")(match::any_of[match::inputs()](mul)).bind("softmax"); return match::name("dot")(is_ck_gemm().bind("gemm2"))( match::any_of[match::inputs()](softmax)); @@ -243,21 +243,23 @@ struct find_ck_gemm_softmax_gemm auto gemm1_ins = r.instructions["gemm1"]; auto scale_ins = r.instructions["scale"]; - if (scale_ins->module_inputs().size() != 1 or not is_mul_module(*scale_ins->module_inputs().front())) + if(scale_ins->module_inputs().size() != 1 or + not is_mul_module(*scale_ins->module_inputs().front())) return; - if (not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) + if(not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) return; - + double scale = 1.0; - for (auto& in: scale_ins->inputs()) + for(auto& in : scale_ins->inputs()) { - if (in->can_eval()) + if(in->can_eval()) { in->get_literal().visit([&](const auto s) { - if (std::all_of( - s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) + if(std::all_of(s.begin() + 1, s.end(), [&](auto v) { + return float_equal(v, s.front()); + })) scale = s.front(); - else + else return; }); } diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index 53afb42c20c..e2e24f74dfa 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -352,7 +352,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler const auto& a_shape = inputs[0]; const auto& b_shape = inputs[1]; const auto& b1_shape = inputs[2]; - const auto& c_shape = inputs.back(); + const auto& c_shape = inputs.back(); auto tuning_value = v.get("tuning_value", 4); if(not v.contains("tuning_value")) tuning_value = get_tuning_for({a_shape, b_shape, b1_shape, c_shape}); @@ -399,7 +399,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler {"blocks_per_batch", to_string(blocks_per_batch)}, {"preamble", v.get("preamble", std::string{})}, {"kernel", options.kernel_name}}); - + return compile_hip_code_object(src, options); } From 59993d98230aec4e38db259e2db9ddd60f46579e Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Fri, 22 Sep 2023 23:35:13 +0000 Subject: [PATCH 06/38] Update CK commit hash --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e8994165133..00268e058de 100755 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.6.0 
pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On -# ROCmSoftwarePlatform/composable_kernel@5172ec5280f14974beee2acf1af1db3b2670244c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On +ROCmSoftwarePlatform/composable_kernel@11cab2d533ff155948812f6877d8123bdd53c338 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On From d1d48bdc1ffa2b2f05ecddd3724bc6d9944469ad Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 26 Sep 2023 21:15:33 +0000 Subject: [PATCH 07/38] Cleanup --- .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 4 +- test/onnx/gen_onnx.py | 79 ------------------- test/onnx/old_gemm_softmax_gemm_test.onnx | 46 ----------- 3 files changed, 2 insertions(+), 127 deletions(-) delete mode 100644 test/onnx/old_gemm_softmax_gemm_test.onnx diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp index abe62473702..6cc67b017a4 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -65,9 +65,9 @@ __device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) to_ck_tensor()); static_assert(desc.IsValid(), "Invalid ck gemm."); - const float scale = s.scale; + G::Run(desc, - scale, + s.scale, to_ck_const_pointer(a.data()), to_ck_const_pointer(b.data()), to_ck_const_pointer(b1.data()), diff --git a/test/onnx/gen_onnx.py b/test/onnx/gen_onnx.py index 490bc921bec..2fb2b4082e3 100644 --- a/test/onnx/gen_onnx.py +++ b/test/onnx/gen_onnx.py @@ -7799,82 +7799,3 @@ def where_mixed_test(): outputs=['z']) return ([node], [c, x, y], [z]) - - -@onnx_test() -def gemm_softmax_gemm_test(): - a = helper.make_tensor_value_info('a', TensorProto.FLOAT16, [1, 1]) - b = helper.make_tensor_value_info('b', TensorProto.FLOAT16, [1, 1]) - # c = helper.make_tensor_value_info('c', TensorProto.FLOAT16, [1, 1]) - b1 = helper.make_tensor_value_info('b1', TensorProto.FLOAT16, [1, 1]) - # bias = helper.make_tensor_value_info('bias', TensorProto.FLOAT16, [1, 1]) - out = helper.make_tensor_value_info('out', TensorProto.FLOAT16, [1, 1]) - - scale_array = np.array([1]) - bias_array = np.array([0]) - - scale_tensor = helper.make_tensor(name='scale', - data_type=TensorProto.FLOAT16, - dims=[1, 1], - vals=[1]) - bias_tensor = helper.make_tensor(name='bias', - data_type=TensorProto.FLOAT16, - dims=[1, 1], - vals=[0]) - - gemm1 = onnx.helper.make_node('MatMul', - inputs=['a', 'b'], - outputs=['gemm1_out']) - mul1 = onnx.helper.make_node('Mul', - inputs=['gemm1_out', 'scale'], - outputs=['mul1_out']) - add1 = onnx.helper.make_node('Add', - inputs=['mul1_out', 'bias'], - outputs=['add1_out']) - softmax = onnx.helper.make_node('Softmax', - inputs=['add1_out'], - outputs=['softmax_out']) - gemm2 = onnx.helper.make_node('MatMul', - inputs=['softmax_out', 'b1'], - outputs=['out']) - - - return ([gemm1, mul1, add1, softmax, gemm2], [a, b, b1], [out], [scale_tensor, bias_tensor]) - - - -@onnx_test() -def old_gemm_softmax_gemm_test(): - a = helper.make_tensor_value_info('a', TensorProto.FLOAT16, [1, 1]) - b = helper.make_tensor_value_info('b', TensorProto.FLOAT16, [1, 1]) - c = helper.make_tensor_value_info('c', TensorProto.FLOAT16, [1, 1]) - b1 = helper.make_tensor_value_info('b1', TensorProto.FLOAT16, [1, 1]) - bias = helper.make_tensor_value_info('bias', TensorProto.FLOAT16, 
[1, 1]) - out = helper.make_tensor_value_info('out', TensorProto.FLOAT16, [1, 1]) - - scale_array = np.array([(1/8)]) - - scale_tensor = helper.make_tensor('scale', - TensorProto.FLOAT16, - [1, 1], - [1]) - - gemm1 = onnx.helper.make_node('MatMul', - inputs=['a', 'b'], - outputs=['gemm1_out']) - mul1 = onnx.helper.make_node('Mul', - inputs=['gemm1_out', 'scale'], - outputs=['mul1_out']) - add1 = onnx.helper.make_node('Add', - inputs=['mul1_out', 'c'], - outputs=['add1_out']) - softmax = onnx.helper.make_node('Softmax', - inputs=['add1_out'], - outputs=['softmax_out']) - gemm2 = onnx.helper.make_node('MatMul', - inputs=['softmax_out', 'b1'], - outputs=['out']) - - - return ([gemm1, mul1, add1, softmax, gemm2], [a, b, c, b1, bias], [out], [scale_tensor]) - diff --git a/test/onnx/old_gemm_softmax_gemm_test.onnx b/test/onnx/old_gemm_softmax_gemm_test.onnx deleted file mode 100644 index 91032e3b6ea..00000000000 --- a/test/onnx/old_gemm_softmax_gemm_test.onnx +++ /dev/null @@ -1,46 +0,0 @@ -old_gemm_softmax_gemm_test:Ð - -a -b gemm1_out"MatMul -! - gemm1_out -scalemul1_out"Mul - -mul1_out -cadd1_out"Add - -add1_out softmax_out"Softmax - - softmax_out -b1out"MatMulold_gemm_softmax_gemm_test* -*BscaleZ -a -  - - -Z -b -  - - -Z -c -  - - -Z -b1 -  - - -Z -bias -  - - -b -out -  - - -B \ No newline at end of file From e781c38ab1f1ff253ca015831f00460e784d9309 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 26 Sep 2023 21:15:49 +0000 Subject: [PATCH 08/38] Formatting --- .../kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp index 6cc67b017a4..6a8acc503d6 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -65,7 +65,7 @@ __device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) to_ck_tensor()); static_assert(desc.IsValid(), "Invalid ck gemm."); - + G::Run(desc, s.scale, to_ck_const_pointer(a.data()), From 59a0b0ce7b84054a86bd5c3f93569a7727262a8e Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Wed, 27 Sep 2023 17:37:09 +0000 Subject: [PATCH 09/38] Format and cppcheck --- src/targets/gpu/fuse_ck.cpp | 4 ++-- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 841e95eb2ad..309070ebe34 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -95,8 +95,8 @@ struct ck_gemm_softmax_gemm shape compute_shape(std::vector inputs, const std::vector&) const { check_shapes{inputs, *this}.same_ndims(); - if(inputs.size() < 2) - MIGRAPHX_THROW("should have at least two inputs."); + if(inputs.size() < 3) + MIGRAPHX_THROW("Expected 3 inputs but got " + to_string(inputs.size())); auto a = inputs[0]; auto b = inputs[1]; auto b1 = inputs[2]; diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index e2e24f74dfa..9bae04c7e3e 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -349,11 +349,11 @@ struct ck_gemm_softmax_gemm_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& a_shape = inputs[0]; - const auto& b_shape = inputs[1]; + const 
auto& a_shape = inputs[0]; + const auto& b_shape = inputs[1]; const auto& b1_shape = inputs[2]; const auto& c_shape = inputs.back(); - auto tuning_value = v.get("tuning_value", 4); + auto tuning_value = v.get("tuning_value", 4); if(not v.contains("tuning_value")) tuning_value = get_tuning_for({a_shape, b_shape, b1_shape, c_shape}); auto batch_count = get_batch_count(c_shape); From 4a066e44b56dc1ffd03447b9124aff58875615cb Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Wed, 27 Sep 2023 19:57:22 +0000 Subject: [PATCH 10/38] Address fuse_ck review comments --- src/targets/gpu/fuse_ck.cpp | 70 +++++++++++++++---------------------- 1 file changed, 28 insertions(+), 42 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 309070ebe34..940a83da429 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -203,67 +203,53 @@ struct find_ck_gemm } }; -static bool is_mul_module(const module& m) +static auto is_mul_module(module& m) { - std::vector result; - for(auto& ins : m) - { - if(starts_with(ins.name(), "@")) - continue; - if(contains({"multibroadcast", "contiguous"}, ins.name())) - continue; - if(ins.name() == "pointwise") - { - return is_mul_module(*ins.module_inputs().front()); - } - else if(ins.name() == "mul") - { - return true; - } - } - return false; + auto is_mul = match::arg(0)(match::name("mul")(match::all_of[match::inputs()](match::name("@param")))); + return match_instruction(m, std::prev(m.end()), is_mul).result != m.end(); +} + +MIGRAPHX_PRED_MATCHER(is_pointwise_scale, instruction_ref ins) +{ + if (ins->name() != "pointwise") + return false; + if (ins->module_inputs().size() != 1) + return false; + return is_mul_module(*ins->module_inputs().front()); } struct find_ck_gemm_softmax_gemm { auto matcher() const { - auto gemm1 = - match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")(match::any_of[match::inputs()](gemm1)).bind("scale"); - auto softmax = match::name("softmax")(match::any_of[match::inputs()](mul)).bind("softmax"); - return match::name("dot")(is_ck_gemm().bind("gemm2"))( - match::any_of[match::inputs()](softmax)); + auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); + auto mul = match::name("pointwise")(match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1))(is_pointwise_scale()); + auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); + + return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax)); } void apply(module_pass_manager& mpm, const match::matcher_result& r) const { + std::cout << "Matched GSG" << std::endl; auto ins = r.result; auto gemm2_ins = r.instructions["gemm2"]; auto gemm1_ins = r.instructions["gemm1"]; - auto scale_ins = r.instructions["scale"]; + auto scale_lit = r.instructions["scale"]; - if(scale_ins->module_inputs().size() != 1 or - not is_mul_module(*scale_ins->module_inputs().front())) - return; if(not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) return; double scale = 1.0; - for(auto& in : scale_ins->inputs()) - { - if(in->can_eval()) - { - in->get_literal().visit([&](const auto s) { - if(std::all_of(s.begin() + 1, s.end(), [&](auto v) { - return float_equal(v, s.front()); - })) - scale = s.front(); - else - return; - }); - } - } + scale_lit->get_literal().visit([&](const auto s) { + // CK only supports single-valued scale + if(std::all_of(s.begin() + 1, s.end(), [&](auto v) { + return 
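               // Editor's note (illustrative): this compares every element
               // against the first, so a broadcast literal such as
               // {0.125, 0.125, ...} qualifies as a single-valued scale while a
               // genuinely per-element scale is rejected.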
float_equal(v, s.front()); + })) + scale = s.front(); + else + return; + }); auto inputs = gemm1_ins->inputs(); // A, B inputs.push_back(gemm2_ins->inputs().back()); // B1 From 791addfb48bfb5292d68f8756b85002d01f50f8b Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Wed, 27 Sep 2023 19:59:12 +0000 Subject: [PATCH 11/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 940a83da429..d42f6067c5b 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -205,15 +205,16 @@ struct find_ck_gemm static auto is_mul_module(module& m) { - auto is_mul = match::arg(0)(match::name("mul")(match::all_of[match::inputs()](match::name("@param")))); + auto is_mul = + match::arg(0)(match::name("mul")(match::all_of[match::inputs()](match::name("@param")))); return match_instruction(m, std::prev(m.end()), is_mul).result != m.end(); } MIGRAPHX_PRED_MATCHER(is_pointwise_scale, instruction_ref ins) { - if (ins->name() != "pointwise") + if(ins->name() != "pointwise") return false; - if (ins->module_inputs().size() != 1) + if(ins->module_inputs().size() != 1) return false; return is_mul_module(*ins->module_inputs().front()); } @@ -222,8 +223,10 @@ struct find_ck_gemm_softmax_gemm { auto matcher() const { - auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")(match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1))(is_pointwise_scale()); + auto gemm1 = + match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); + auto mul = match::name("pointwise")(match::either_arg(0, 1)( + match::is_constant().bind("scale"), gemm1))(is_pointwise_scale()); auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax)); @@ -243,9 +246,8 @@ struct find_ck_gemm_softmax_gemm double scale = 1.0; scale_lit->get_literal().visit([&](const auto s) { // CK only supports single-valued scale - if(std::all_of(s.begin() + 1, s.end(), [&](auto v) { - return float_equal(v, s.front()); - })) + if(std::all_of( + s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) scale = s.front(); else return; From 135eb63e5f19ef7a1eb42b2cd4c35f0ce69081d7 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Wed, 27 Sep 2023 20:08:28 +0000 Subject: [PATCH 12/38] Move common CK device functions to ck.hpp --- .../kernels/include/migraphx/kernels/ck.hpp | 23 +++++++++++++++++++ .../include/migraphx/kernels/ck_gemm.hpp | 11 --------- .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 23 ------------------- 3 files changed, 23 insertions(+), 34 deletions(-) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp index f8ba21d9570..bc41fc99a87 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp @@ -154,6 +154,29 @@ struct ck_add } }; +// In CK, the B matrix is ordered as N,K instead of K,N +template +constexpr auto ck_transposeb_dims(Dims dims) +{ + return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); }); +} + +template +using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), + ck_transposeb_dims(get_shape_c{}.strides))); + +template +struct ck_gemm_softmax_gemm_settings +{ + T scale{}; +}; + 
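+// Editor's note (illustrative, not part of the original patch): the settings
+// struct is a plain aggregate whose only job is to carry the softmax scale
+// into the kernel. A minimal usage sketch, assuming a float scale of 1/8:
+//
+//     auto s = make_ck_gemm_softmax_gemm_settings(0.125f);
+//     // s.scale == 0.125f, later consumed by ck_gemm_softmax_gemm_matrix
+//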
+template +constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... xs) +{ + return {xs...}; +} + #ifdef MIGRAPHX_CK_CHECK #define MIGRAPHX_CK_STATIC_ASSERT static_assert #else diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp index fb032ca7e96..bc942029a29 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm.hpp @@ -33,17 +33,6 @@ namespace migraphx { -// In CK, the B matrix is ordered as N,K instead of K,N -template -constexpr auto ck_transposeb_dims(Dims dims) -{ - return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); }); -} - -template -using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), - ck_transposeb_dims(get_shape_c{}.strides))); - template __device__ void ck_gemm_matrix(E e, A a, B b, Ds... ds) { diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp index 6a8acc503d6..4db58045bb0 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -33,29 +33,6 @@ namespace migraphx { -// In CK, the B matrix is ordered as N,K instead of K,N -template -constexpr auto ck_transposeb_dims(Dims dims) -{ - return unpack(dims, [](auto k, auto n) { return make_const_array(n, k); }); -} - -template -using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), - ck_transposeb_dims(get_shape_c{}.strides))); - -template -struct ck_gemm_softmax_gemm_settings -{ - T scale{}; -}; - -template -constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... 
xs) -{ - return {xs...}; -} - template __device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) { From 7e8b69ada5b681820caf6f9d9cf2d30e8cab68f4 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Thu, 28 Sep 2023 19:38:00 +0000 Subject: [PATCH 13/38] Update CK SHA --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 00268e058de..e37274873a6 100755 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.6.0 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On -ROCmSoftwarePlatform/composable_kernel@11cab2d533ff155948812f6877d8123bdd53c338 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On +ROCmSoftwarePlatform/composable_kernel@4cd24e6639dd05d38523f1a8d990a1e3b3116ab6 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On From 7103d3ed85f17abfa98deecc64f4ec1a02458264 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Thu, 28 Sep 2023 19:58:37 +0000 Subject: [PATCH 14/38] Call eval on scale lit and move gsg settings out of ck.hpp --- src/targets/gpu/fuse_ck.cpp | 3 +-- .../gpu/kernels/include/migraphx/kernels/ck.hpp | 12 ------------ .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 12 ++++++++++++ 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index d42f6067c5b..735c3db2913 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -234,7 +234,6 @@ struct find_ck_gemm_softmax_gemm void apply(module_pass_manager& mpm, const match::matcher_result& r) const { - std::cout << "Matched GSG" << std::endl; auto ins = r.result; auto gemm2_ins = r.instructions["gemm2"]; auto gemm1_ins = r.instructions["gemm1"]; @@ -244,7 +243,7 @@ struct find_ck_gemm_softmax_gemm return; double scale = 1.0; - scale_lit->get_literal().visit([&](const auto s) { + scale_lit->eval().visit([&](const auto s) { // CK only supports single-valued scale if(std::all_of( s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp index bc41fc99a87..370191155da 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck.hpp @@ -165,18 +165,6 @@ template using ck_transposeb = decltype(make_shape(ck_transposeb_dims(get_shape_c{}.lens), ck_transposeb_dims(get_shape_c{}.strides))); -template -struct ck_gemm_softmax_gemm_settings -{ - T scale{}; -}; - -template -constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... xs) -{ - return {xs...}; -} - #ifdef MIGRAPHX_CK_CHECK #define MIGRAPHX_CK_STATIC_ASSERT static_assert #else diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp index 4db58045bb0..41021d66c57 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp @@ -33,6 +33,18 @@ namespace migraphx { +template +struct ck_gemm_softmax_gemm_settings +{ + T scale{}; +}; + +template +constexpr ck_gemm_softmax_gemm_settings make_ck_gemm_softmax_gemm_settings(Ts... 
xs) +{ + return {xs...}; +} + template __device__ void ck_gemm_softmax_gemm_matrix(C c, A a, B b, B1 b1, Settings s) { From 6fe8b43c122bbf41c95f2d8c0b120347097accb7 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Fri, 29 Sep 2023 14:44:12 +0000 Subject: [PATCH 15/38] Update CK SHA --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index e37274873a6..0b3fb6925b4 100755 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.6.0 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On -ROCmSoftwarePlatform/composable_kernel@4cd24e6639dd05d38523f1a8d990a1e3b3116ab6 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On +ROCmSoftwarePlatform/composable_kernel@0e97ebaa0ba18ec1d0247ebae6c45e0996563d0a -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On From 6d57f78db258f7abfa36ed068fce4e9a5805de5d Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Fri, 29 Sep 2023 15:38:52 +0000 Subject: [PATCH 16/38] Add name() to CK compute shape throws; enforce mul has 2 args --- src/targets/gpu/fuse_ck.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 735c3db2913..02a64e40a6c 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -55,7 +55,7 @@ struct ck_gemm { check_shapes{inputs, *this}.same_ndims(); if(inputs.size() < 2) - MIGRAPHX_THROW("should have at least two inputs."); + MIGRAPHX_THROW(name() + ": should have at least two inputs."); auto a = inputs[0]; auto b = inputs[1]; for(const auto& input : inputs) @@ -96,7 +96,7 @@ struct ck_gemm_softmax_gemm { check_shapes{inputs, *this}.same_ndims(); if(inputs.size() < 3) - MIGRAPHX_THROW("Expected 3 inputs but got " + to_string(inputs.size())); + MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size())); auto a = inputs[0]; auto b = inputs[1]; auto b1 = inputs[2]; @@ -225,7 +225,7 @@ struct find_ck_gemm_softmax_gemm { auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")(match::either_arg(0, 1)( + auto mul = match::name("pointwise")(match::nargs(2), match::either_arg(0, 1)( match::is_constant().bind("scale"), gemm1))(is_pointwise_scale()); auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); From a9b32b71a6ffe2602282ae12098725f66693ae5f Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Fri, 29 Sep 2023 15:38:59 +0000 Subject: [PATCH 17/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index 02a64e40a6c..f3718db6590 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -225,8 +225,9 @@ struct find_ck_gemm_softmax_gemm { auto gemm1 = match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")(match::nargs(2), match::either_arg(0, 1)( - match::is_constant().bind("scale"), gemm1))(is_pointwise_scale()); + auto mul = match::name("pointwise")( + match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1))( + is_pointwise_scale()); auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); return 
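            // Editor's note (illustrative, not part of the original patch):
            // read bottom-up, this matcher anchors on the second dot and
            // requires the subgraph
            //
            //     dot(softmax(pointwise_mul(dot(A, B), scale_literal)), B1)
            //
            // with the scale constrained to a constant so it can later be
            // folded into the fused kernel as a single value.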
match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax)); From c96139f82dd2391f0bac2839f1c3502e1a610376 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 3 Oct 2023 17:08:24 +0000 Subject: [PATCH 18/38] Move common functions to ck.hpp + other cleanup --- src/targets/gpu/fuse_ck.cpp | 6 +- src/targets/gpu/include/migraphx/gpu/ck.hpp | 182 ++++++++++++++ src/targets/gpu/jit/ck_gemm.cpp | 231 +----------------- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 238 +------------------ test/verify/ck_gemm_softmax_gemm.cpp | 5 +- 5 files changed, 196 insertions(+), 466 deletions(-) create mode 100644 src/targets/gpu/include/migraphx/gpu/ck.hpp diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index f3718db6590..cf4d359d17d 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -76,7 +76,7 @@ MIGRAPHX_REGISTER_OP(ck_gemm); struct ck_gemm_softmax_gemm { operation op = make_op("dot"); - double scale = 1.0; + float scale = 1.0; template static auto reflect(Self& self, F f) @@ -203,7 +203,7 @@ struct find_ck_gemm } }; -static auto is_mul_module(module& m) +auto is_mul_module(module& m) { auto is_mul = match::arg(0)(match::name("mul")(match::all_of[match::inputs()](match::name("@param")))); @@ -243,7 +243,7 @@ struct find_ck_gemm_softmax_gemm if(not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) return; - double scale = 1.0; + float scale = 1.0; scale_lit->eval().visit([&](const auto s) { // CK only supports single-valued scale if(std::all_of( diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp new file mode 100644 index 00000000000..c6950bded56 --- /dev/null +++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp @@ -0,0 +1,182 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_GPU_CK_HPP +#define MIGRAPHX_GUARD_GPU_CK_HPP + +#include +#include +#include + +#include "ck/host/device_gemm_multiple_d.hpp" +#include "ck/host/device_batched_gemm_softmax_gemm.hpp" + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { +namespace gpu { + +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); + +// NOLINTNEXTLINE +const char* const disable_warning_pragma = R"__migraphx__( +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Weverything" +${content} +#pragma clang diagnostic pop +)__migraphx__"; + +template +std::string ck_disable_warnings(P p) +{ + return interpolate_string(disable_warning_pragma, + {{"content", std::string{p.first, p.second}}}); +} + +static std::unordered_map create_ck_header_strings() +{ + std::unordered_map result; + auto ck_headers = ck::host::GetHeaders(); + + std::transform( + ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) { + return std::make_pair(p.first, ck_disable_warnings(p.second)); + }); + return result; +} + +static std::vector create_ck_headers() +{ + static const auto& header_strings = create_ck_header_strings(); + std::vector srcs; + std::transform( + header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) { + return src_file{fs::path{p.first}, + {p.second.data(), p.second.data() + p.second.size()}}; + }); + return srcs; +} + +static const std::vector& ck_headers() +{ + static const auto& headers = create_ck_headers(); + return headers; +} + +inline bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } + +inline float matrix_distance(const shape& x, const shape& y) +{ + if(x.type() != y.type()) + return std::numeric_limits::max(); + if(transposed_matrix(x) != transposed_matrix(y)) + return std::numeric_limits::max(); + auto sum_squared = std::inner_product(x.lens().rbegin(), + x.lens().rbegin() + 2, + y.lens().rbegin(), + 0, + std::plus<>{}, + [](auto a, auto b) { return (a - b) * (a - b); }); + return std::sqrt(sum_squared); +} + +inline std::string get_layout(const shape& s) +{ + return transposed_matrix(s) ? 
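+    // Editor's note (illustrative, not part of the original patch): a matrix
+    // whose innermost stride is not 1 is stored transposed. For example, an
+    // {M, K} matrix with strides {K, 1} is RowMajor, while strides {1, M}
+    // walk down a column first and map to CK's ColumnMajor layout.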
"ck::tensor_layout::gemm::ColumnMajor" + : "ck::tensor_layout::gemm::RowMajor"; +} + +inline ck::host::DataType get_type(const shape& s) +{ + if(s.type() == shape::half_type) + return ck::host::DataType::Half; + else if(s.type() == shape::float_type) + return ck::host::DataType::Float; + else if(s.type() == shape::int8_type) + return ck::host::DataType::Int8; + else if(s.type() == shape::int32_type) + return ck::host::DataType::Int32; + MIGRAPHX_THROW("Unsupported ck type"); +} + +inline std::size_t get_batch_count(const shape& s) +{ + return std::accumulate( + s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies()); +} + +inline void fold_batch_dims(shape& s) +{ + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto batch_count = get_batch_count(s); + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + if(transposed_matrix(s)) + s = shape{s.type(), {m1, m2 * batch_count}}; + else + s = shape{s.type(), {m1 * batch_count, m2}}; +} + +inline void remove_batch_dims(shape& s) +{ + auto lens = s.lens(); + if(lens.size() <= 2) + return; + auto m1 = lens.at(lens.size() - 2); + auto m2 = lens.at(lens.size() - 1); + s = shape{s.type(), {m1, m2}}; +} + +inline bool standard_batch(const shape& s) +{ + if(s.lens().size() < 3) + return true; + std::vector lens(s.lens().begin(), s.lens().end() - 2); + std::vector strides(s.strides().begin(), s.strides().end() - 2); + auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1); + std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) { + return stride / base; + }); + return shape{s.type(), lens, strides}.standard(); +} + +inline bool can_fold_batch(const std::vector& inputs) +{ + const auto& b_shape = inputs[1]; + if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { + return not standard_batch(input); + })) + return false; + const auto& b_strides = b_shape.strides(); + return std::all_of( + b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); +} + +} // namespace gpu + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx +#endif // MIGRAPHX_GUARD_GPU_CK_HPP diff --git a/src/targets/gpu/jit/ck_gemm.cpp b/src/targets/gpu/jit/ck_gemm.cpp index 65bed54a800..dedc9dd0322 100644 --- a/src/targets/gpu/jit/ck_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm.cpp @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -37,8 +38,6 @@ #include #include -#include "ck/host/device_gemm_multiple_d.hpp" - namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -46,12 +45,6 @@ namespace gpu { using namespace migraphx::gpu::gen; // NOLINT -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); - // NOLINTNEXTLINE static const char* const ck_gemm_kernel = R"__migraphx__( #include @@ -79,230 +72,18 @@ MIGRAPHX_GLOBAL void ${kernel}(${params}) )__migraphx__"; -// NOLINTNEXTLINE -static const char* const disable_warning_pragma = R"__migraphx__( -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Weverything" -${content} -#pragma clang diagnostic pop -)__migraphx__"; - -template -static std::string ck_disable_warnings(P p) -{ - return interpolate_string(disable_warning_pragma, - {{"content", std::string{p.first, p.second}}}); -} - -static std::unordered_map create_ck_header_strings() -{ - std::unordered_map result; - auto ck_headers = 
ck::host::GetHeaders(); - - std::transform( - ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) { - return std::make_pair(p.first, ck_disable_warnings(p.second)); - }); - return result; -} - -static std::vector create_ck_headers() -{ - static const auto& header_strings = create_ck_header_strings(); - std::vector srcs; - std::transform( - header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) { - return src_file{fs::path{p.first}, - {p.second.data(), p.second.data() + p.second.size()}}; - }); - return srcs; -} - -static const std::vector& ck_headers() -{ - static const auto& headers = create_ck_headers(); - return headers; -} - -static bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } - -using tuning_entry = std::pair, size_t>; -static std::vector read_tuning(const std::string& s) -{ - if(not fs::exists(s)) - return {}; - return from_value>(from_json_string(read_string(s))); -} - -static float matrix_distance(const shape& x, const shape& y) -{ - if(x.type() != y.type()) - return std::numeric_limits::max(); - if(transposed_matrix(x) != transposed_matrix(y)) - return std::numeric_limits::max(); - auto sum_squared = std::inner_product(x.lens().rbegin(), - x.lens().rbegin() + 2, - y.lens().rbegin(), - 0, - std::plus<>{}, - [](auto a, auto b) { return (a - b) * (a - b); }); - return std::sqrt(sum_squared); -} - -static std::size_t get_tuning_for(const std::vector& inputs) -{ - static auto tuning = read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, "")); - if(tuning.empty()) - { - std::cout << "*********** Warning: No CK tuning! for config:" << std::endl; - std::cout << " " << inputs[0] << std::endl; - std::cout << " " << inputs[1] << std::endl; - std::cout << " " << inputs[2] << std::endl; - } - auto it = std::find_if( - tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; }); - if(it == tuning.end()) - { - std::cout << "*********** Warning: CK tuning missing for config!" << std::endl; - std::cout << " " << inputs[0] << std::endl; - std::cout << " " << inputs[1] << std::endl; - std::cout << " " << inputs[2] << std::endl; - std::vector> w; - std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) { - if(inputs.size() < 3 or p.first.size() < 3) - MIGRAPHX_THROW("Invalid CK config"); - auto avg_distance = std::inner_product( - p.first.begin(), - p.first.begin() + 3, - inputs.begin(), - 0.0f, - std::plus<>{}, - [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; }); - return std::make_pair(avg_distance, p.second); - }); - std::sort(w.begin(), w.end()); - std::size_t default_value = 4; - if(not w.empty()) - default_value = w.front().second; - auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value); - std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl; - return tuning_val; - } - return it->second; -} - struct ck_gemm_compiler : compiler { - static std::string get_layout(const shape& s) - { - return transposed_matrix(s) ? 
"ck::tensor_layout::gemm::ColumnMajor" - : "ck::tensor_layout::gemm::RowMajor"; - } - - static ck::host::DataType get_type(const shape& s) - { - if(s.type() == shape::half_type) - return ck::host::DataType::Half; - else if(s.type() == shape::float_type) - return ck::host::DataType::Float; - else if(s.type() == shape::int8_type) - return ck::host::DataType::Int8; - else if(s.type() == shape::int32_type) - return ck::host::DataType::Int32; - MIGRAPHX_THROW("Unsupported ck type"); - } - - template - static std::string ck_tuple(Iterator start, Iterator last, F f) - { - std::vector s; - std::transform(start, last, std::back_inserter(s), f); - return "ck::Tuple<" + join_strings(s, ",") + ">"; - } - - static std::vector adjust_inputs(std::vector inputs, bool& swap_inputs) - { - swap_inputs = false; - auto c_shape = inputs.back(); - if(not transposed_matrix(c_shape)) - return inputs; - std::vector perm(c_shape.lens().size()); - std::iota(perm.begin(), perm.end(), 0); - std::swap(perm[perm.size() - 1], perm[perm.size() - 2]); - std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) { - return reorder_shape(s, perm); - }); - swap_inputs = true; - return inputs; - } - - static std::size_t get_batch_count(const shape& s) - { - return std::accumulate( - s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies()); - } - - static void fold_batch_dims(shape& s) - { - auto lens = s.lens(); - if(lens.size() <= 2) - return; - auto batch_count = get_batch_count(s); - auto m1 = lens.at(lens.size() - 2); - auto m2 = lens.at(lens.size() - 1); - if(transposed_matrix(s)) - s = shape{s.type(), {m1, m2 * batch_count}}; - else - s = shape{s.type(), {m1 * batch_count, m2}}; - } - - static void remove_batch_dims(shape& s) - { - auto lens = s.lens(); - if(lens.size() <= 2) - return; - auto m1 = lens.at(lens.size() - 2); - auto m2 = lens.at(lens.size() - 1); - s = shape{s.type(), {m1, m2}}; - } - std::vector names() const { return {"ck_gemm", "gpu::ck_gemm"}; } - static bool standard_batch(const shape& s) - { - if(s.lens().size() < 3) - return true; - std::vector lens(s.lens().begin(), s.lens().end() - 2); - std::vector strides(s.strides().begin(), s.strides().end() - 2); - auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1); - std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) { - return stride / base; - }); - return shape{s.type(), lens, strides}.standard(); - } - - bool can_fold_batch(const std::vector& inputs) const - { - const auto& b_shape = inputs[1]; - if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { - return not standard_batch(input); - })) - return false; - const auto& b_strides = b_shape.strides(); - return std::all_of( - b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); - } - ck::host::device_gemm_multiple_d::Problem create_problem(const std::vector& inputs, const value& v) const { const auto& a_shape = inputs[0]; const auto& b_shape = inputs[1]; const auto& c_shape = inputs.back(); - - // cppcheck-suppress unreadVariable - auto rank = a_shape.ndim(); - + + auto rank = a_shape.ndim(); auto batch_count = get_batch_count(c_shape); auto m = c_shape.lens()[rank - 2]; m = can_fold_batch(inputs) ? 
m * batch_count : m; @@ -352,12 +133,8 @@ struct ck_gemm_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& a_shape = inputs[0]; - const auto& b_shape = inputs[1]; const auto& c_shape = inputs.back(); - auto tuning_value = v.get("tuning_value", 4); - if(not v.contains("tuning_value")) - tuning_value = get_tuning_for({a_shape, b_shape, c_shape}); + auto tuning_value = v.get("tuning_value", 0); auto batch_count = get_batch_count(c_shape); auto problem = create_problem(inputs, v); diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index 9bae04c7e3e..2c30e262219 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -29,6 +29,7 @@ #include #include +#include #include #include #include @@ -37,8 +38,6 @@ #include #include -#include "ck/host/device_batched_gemm_softmax_gemm.hpp" - namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -46,12 +45,6 @@ namespace gpu { using namespace migraphx::gpu::gen; // NOLINT -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_TUNING_VALUE); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); - // NOLINTNEXTLINE static const char* const ck_gemm_softmax_gemm_kernel = R"__migraphx__( #include @@ -82,236 +75,22 @@ MIGRAPHX_GLOBAL void ${kernel}(${params}) )__migraphx__"; -// NOLINTNEXTLINE -static const char* const disable_warning_pragma = R"__migraphx__( -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Weverything" -${content} -#pragma clang diagnostic pop -)__migraphx__"; - -template -static std::string ck_disable_warnings(P p) -{ - return interpolate_string(disable_warning_pragma, - {{"content", std::string{p.first, p.second}}}); -} - -static std::unordered_map create_ck_header_strings() -{ - std::unordered_map result; - auto ck_headers = ck::host::GetHeaders(); - - std::transform( - ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) { - return std::make_pair(p.first, ck_disable_warnings(p.second)); - }); - return result; -} - -static std::vector create_ck_headers() -{ - static const auto& header_strings = create_ck_header_strings(); - std::vector srcs; - std::transform( - header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) { - return src_file{fs::path{p.first}, - {p.second.data(), p.second.data() + p.second.size()}}; - }); - return srcs; -} - -static const std::vector& ck_headers() -{ - static const auto& headers = create_ck_headers(); - return headers; -} - -static bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } - -using tuning_entry = std::pair, size_t>; -static std::vector read_tuning(const std::string& s) -{ - if(not fs::exists(s)) - return {}; - return from_value>(from_json_string(read_string(s))); -} - -static float matrix_distance(const shape& x, const shape& y) -{ - if(x.type() != y.type()) - return std::numeric_limits::max(); - if(transposed_matrix(x) != transposed_matrix(y)) - return std::numeric_limits::max(); - auto sum_squared = std::inner_product(x.lens().rbegin(), - x.lens().rbegin() + 2, - y.lens().rbegin(), - 0, - std::plus<>{}, - [](auto a, auto b) { return (a - b) * (a - b); }); - return std::sqrt(sum_squared); -} - -static std::size_t get_tuning_for(const std::vector& inputs) -{ - static auto tuning = 
read_tuning(string_value_of(MIGRAPHX_CK_TUNING{}, "")); - if(tuning.empty()) - { - std::cout << "*********** Warning: No CK tuning! for config:" << std::endl; - std::cout << " " << inputs[0] << std::endl; - std::cout << " " << inputs[1] << std::endl; - std::cout << " " << inputs[2] << std::endl; - std::cout << " " << inputs[3] << std::endl; - } - auto it = std::find_if( - tuning.begin(), tuning.end(), [&](const auto& p) { return p.first == inputs; }); - if(it == tuning.end()) - { - std::cout << "*********** Warning: CK tuning missing for config!" << std::endl; - std::cout << " " << inputs[0] << std::endl; - std::cout << " " << inputs[1] << std::endl; - std::cout << " " << inputs[2] << std::endl; - std::cout << " " << inputs[3] << std::endl; - std::vector> w; - std::transform(tuning.begin(), tuning.end(), std::back_inserter(w), [&](const auto& p) { - if(inputs.size() < 3 or p.first.size() < 3) - MIGRAPHX_THROW("Invalid CK config"); - auto avg_distance = std::inner_product( - p.first.begin(), - p.first.begin() + 3, - inputs.begin(), - 0.0f, - std::plus<>{}, - [](const auto& x, const auto& y) { return matrix_distance(x, y) / 3.0f; }); - return std::make_pair(avg_distance, p.second); - }); - std::sort(w.begin(), w.end()); - std::size_t default_value = 5; - if(not w.empty()) - default_value = w.front().second; - auto tuning_val = value_of(MIGRAPHX_CK_TUNING_VALUE{}, default_value); - std::cout << "*********** Warning: CK try tuning: " << tuning_val << std::endl; - return tuning_val; - } - return it->second; -} - struct ck_gemm_softmax_gemm_compiler : compiler { - static std::string get_layout(const shape& s) - { - return transposed_matrix(s) ? "ck::tensor_layout::gemm::ColumnMajor" - : "ck::tensor_layout::gemm::RowMajor"; - } - - static ck::host::DataType get_type(const shape& s) - { - if(s.type() == shape::half_type) - return ck::host::DataType::Half; - else if(s.type() == shape::float_type) - return ck::host::DataType::Float; - else if(s.type() == shape::int8_type) - return ck::host::DataType::Int8; - else if(s.type() == shape::int32_type) - return ck::host::DataType::Int32; - MIGRAPHX_THROW("Unsupported ck type"); - } - - template - static std::string ck_tuple(Iterator start, Iterator last, F f) - { - std::vector s; - std::transform(start, last, std::back_inserter(s), f); - return "ck::Tuple<" + join_strings(s, ",") + ">"; - } - - static std::vector adjust_inputs(std::vector inputs, bool& swap_inputs) - { - swap_inputs = false; - auto c_shape = inputs.back(); - if(not transposed_matrix(c_shape)) - return inputs; - std::vector perm(c_shape.lens().size()); - std::iota(perm.begin(), perm.end(), 0); - std::swap(perm[perm.size() - 1], perm[perm.size() - 2]); - std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](shape s) { - return reorder_shape(s, perm); - }); - swap_inputs = true; - return inputs; - } - - static std::size_t get_batch_count(const shape& s) - { - return std::accumulate( - s.lens().rbegin() + 2, s.lens().rend(), std::size_t{1}, std::multiplies()); - } - - static void fold_batch_dims(shape& s) - { - auto lens = s.lens(); - if(lens.size() <= 2) - return; - auto batch_count = get_batch_count(s); - auto m1 = lens.at(lens.size() - 2); - auto m2 = lens.at(lens.size() - 1); - if(transposed_matrix(s)) - s = shape{s.type(), {m1, m2 * batch_count}}; - else - s = shape{s.type(), {m1 * batch_count, m2}}; - } - - static void remove_batch_dims(shape& s) - { - auto lens = s.lens(); - if(lens.size() <= 2) - return; - auto m1 = lens.at(lens.size() - 2); - auto m2 = lens.at(lens.size() 
- 1); - s = shape{s.type(), {m1, m2}}; - } - std::vector names() const { return {"ck_gemm_softmax_gemm", "gpu::ck_gemm_softmax_gemm"}; } - static bool standard_batch(const shape& s) - { - if(s.lens().size() < 3) - return true; - std::vector lens(s.lens().begin(), s.lens().end() - 2); - std::vector strides(s.strides().begin(), s.strides().end() - 2); - auto base = *(s.lens().end() - 2) * *(s.lens().end() - 1); - std::transform(strides.begin(), strides.end(), strides.begin(), [&](auto stride) { - return stride / base; - }); - return shape{s.type(), lens, strides}.standard(); - } - - bool can_fold_batch(const std::vector& inputs) const - { - const auto& b_shape = inputs[1]; - if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { - return not standard_batch(input); - })) - return false; - const auto& b_strides = b_shape.strides(); - return std::all_of( - b_strides.begin(), b_strides.end() - 2, [](auto stride) { return stride == 0; }); - } - ck::host::device_batched_gemm_softmax_gemm::Problem - create_problem(const std::vector& inputs, const value& v) const + create_problem(const std::vector& inputs, const value&) const { const auto& a_shape = inputs[0]; const auto& b_shape = inputs[1]; const auto& b1_shape = inputs[2]; const auto& c_shape = inputs.back(); - - // cppcheck-suppress unreadVariable - auto rank = a_shape.ndim(); - + + auto rank = a_shape.ndim(); auto batch_count = get_batch_count(c_shape); auto m = c_shape.lens()[rank - 2]; m = can_fold_batch(inputs) ? m * batch_count : m; @@ -349,13 +128,8 @@ struct ck_gemm_softmax_gemm_compiler : compiler operation compile_op(context& ctx, const std::vector& inputs, const value& v) const { - const auto& a_shape = inputs[0]; - const auto& b_shape = inputs[1]; - const auto& b1_shape = inputs[2]; const auto& c_shape = inputs.back(); - auto tuning_value = v.get("tuning_value", 4); - if(not v.contains("tuning_value")) - tuning_value = get_tuning_for({a_shape, b_shape, b1_shape, c_shape}); + auto tuning_value = v.get("tuning_value", 5); auto batch_count = get_batch_count(c_shape); auto problem = create_problem(inputs, v); @@ -399,7 +173,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler {"blocks_per_batch", to_string(blocks_per_batch)}, {"preamble", v.get("preamble", std::string{})}, {"kernel", options.kernel_name}}); - + return compile_hip_code_object(src, options); } diff --git a/test/verify/ck_gemm_softmax_gemm.cpp b/test/verify/ck_gemm_softmax_gemm.cpp index 1759e36f4bd..84c309cb734 100644 --- a/test/verify/ck_gemm_softmax_gemm.cpp +++ b/test/verify/ck_gemm_softmax_gemm.cpp @@ -35,17 +35,14 @@ struct ck_gemm_softmax_gemm : verify_program auto* mm = p.get_main_module(); migraphx::shape m1_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; migraphx::shape m2_shape{migraphx::shape::half_type, {1, 12, 256, 256}}; - auto m2_elements = 1 * 12 * 256 * 256; + auto m2_elements = m2_shape.elements(); auto a = mm->add_parameter("1", m1_shape); auto b = mm->add_parameter("2", m1_shape); auto b1 = mm->add_parameter("3", m1_shape); - auto c = mm->add_parameter("4", m1_shape); std::vector eights(m2_elements, 0.125); auto eight = mm->add_literal(migraphx::literal{m2_shape, eights}); std::vector zeros(m2_elements, 0); auto zero = mm->add_literal(migraphx::literal{m2_shape, zeros}); - std::vector ones(m2_elements, 1); - auto one = mm->add_literal(migraphx::literal{m2_shape, ones}); b = mm->add_instruction(migraphx::make_op("transpose", {{"permutation", {0, 1, 3, 2}}}), b); auto gemm1 = mm->add_instruction(migraphx::make_op("dot"), a, b); From 
0b6b490e9d635131a7dddd4aac91aab870b0b78a Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 3 Oct 2023 17:08:32 +0000 Subject: [PATCH 19/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 2 +- src/targets/gpu/include/migraphx/gpu/ck.hpp | 4 ++-- src/targets/gpu/jit/ck_gemm.cpp | 2 +- src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index cf4d359d17d..b769c0d3a51 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -76,7 +76,7 @@ MIGRAPHX_REGISTER_OP(ck_gemm); struct ck_gemm_softmax_gemm { operation op = make_op("dot"); - float scale = 1.0; + float scale = 1.0; template static auto reflect(Self& self, F f) diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp index c6950bded56..b70bf107792 100644 --- a/src/targets/gpu/include/migraphx/gpu/ck.hpp +++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp @@ -167,8 +167,8 @@ inline bool can_fold_batch(const std::vector& inputs) { const auto& b_shape = inputs[1]; if(std::any_of(inputs.begin() + 2, inputs.end() - 1, [](auto input) { - return not standard_batch(input); - })) + return not standard_batch(input); + })) return false; const auto& b_strides = b_shape.strides(); return std::all_of( diff --git a/src/targets/gpu/jit/ck_gemm.cpp b/src/targets/gpu/jit/ck_gemm.cpp index dedc9dd0322..523a21fba0b 100644 --- a/src/targets/gpu/jit/ck_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm.cpp @@ -82,7 +82,7 @@ struct ck_gemm_compiler : compiler const auto& a_shape = inputs[0]; const auto& b_shape = inputs[1]; const auto& c_shape = inputs.back(); - + auto rank = a_shape.ndim(); auto batch_count = get_batch_count(c_shape); auto m = c_shape.lens()[rank - 2]; diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp index 2c30e262219..c9bab757f40 100644 --- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp +++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp @@ -89,7 +89,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler const auto& b_shape = inputs[1]; const auto& b1_shape = inputs[2]; const auto& c_shape = inputs.back(); - + auto rank = a_shape.ndim(); auto batch_count = get_batch_count(c_shape); auto m = c_shape.lens()[rank - 2]; @@ -173,7 +173,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler {"blocks_per_batch", to_string(blocks_per_batch)}, {"preamble", v.get("preamble", std::string{})}, {"kernel", options.kernel_name}}); - + return compile_hip_code_object(src, options); } From 370b2ccea2a1cc50956d41d5d5a0a10dd1bdb764 Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Tue, 3 Oct 2023 17:24:17 +0000 Subject: [PATCH 20/38] Update CK SHA --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 0b3fb6925b4..5476de9ba18 100755 --- a/requirements.txt +++ b/requirements.txt @@ -28,4 +28,4 @@ ROCmSoftwarePlatform/half@rocm-5.6.0 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On -ROCmSoftwarePlatform/composable_kernel@0e97ebaa0ba18ec1d0247ebae6c45e0996563d0a -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On +ROCmSoftwarePlatform/composable_kernel@4b0b327b81668978249fd9b6eb1c35214e7d78ea -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On From a7d57049c56fbd18dfce468d7554a1a0e7ef8501 Mon Sep 17 00:00:00 2001 From: Alan Turner 
Date: Wed, 4 Oct 2023 15:07:21 +0000 Subject: [PATCH 21/38] Move gemm_softmax_gemm matching to prefuse_ops --- src/include/migraphx/ck.hpp | 81 +++++++++++++++++++ src/targets/gpu/fuse_ck.cpp | 89 ++------------------- src/targets/gpu/include/migraphx/gpu/ck.hpp | 29 +------ src/targets/gpu/prefuse_ops.cpp | 60 +++++++++++++- src/targets/gpu/target.cpp | 4 +- 5 files changed, 150 insertions(+), 113 deletions(-) create mode 100644 src/include/migraphx/ck.hpp diff --git a/src/include/migraphx/ck.hpp b/src/include/migraphx/ck.hpp new file mode 100644 index 00000000000..2e4229c6de1 --- /dev/null +++ b/src/include/migraphx/ck.hpp @@ -0,0 +1,81 @@ +/* + * The MIT License (MIT) + * + * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. 
+ */ +#ifndef MIGRAPHX_GUARD_CK_HPP +#define MIGRAPHX_GUARD_CK_HPP + +#include +#include +#include + +namespace migraphx { +inline namespace MIGRAPHX_INLINE_NS { + +#ifndef _WIN32 +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); +MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); +#endif + +struct gemm_softmax_gemm +{ + operation op = make_op("dot"); + float scale = 1.0; + + template + static auto reflect(Self& self, F f) + { + return pack(f(self.op, "op"), f(self.scale, "scale")); + } + + std::string name() const { return "pre_gemm_softmax_gemm"; } + + void check_gemm_shape(const shape& s) const + { + if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1)) + MIGRAPHX_THROW("Invalid shape for ck_gemm_softmax_gemm"); + } + + shape compute_shape(std::vector inputs, const std::vector&) const + { + check_shapes{inputs, *this}.same_ndims(); + if(inputs.size() < 3) + MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size())); + auto a = inputs[0]; + auto b = inputs[1]; + auto b1 = inputs[2]; + for(const auto& input : inputs) + { + check_gemm_shape(input); + } + return op.compute_shape({op.compute_shape({a, b}), b1}); + } + + static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); } +}; + +} // namespace MIGRAPHX_INLINE_NS +} // namespace migraphx + +#endif diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp index b769c0d3a51..ab3db9736bc 100644 --- a/src/targets/gpu/fuse_ck.cpp +++ b/src/targets/gpu/fuse_ck.cpp @@ -24,8 +24,8 @@ #include #include #include -#include #include +#include namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { @@ -73,41 +73,9 @@ struct ck_gemm }; MIGRAPHX_REGISTER_OP(ck_gemm); -struct ck_gemm_softmax_gemm +struct ck_gemm_softmax_gemm : gemm_softmax_gemm { - operation op = make_op("dot"); - float scale = 1.0; - - template - static auto reflect(Self& self, F f) - { - return pack(f(self.op, "op"), f(self.scale, "scale")); - } - std::string name() const { return "gpu::ck_gemm_softmax_gemm"; } - - void check_gemm_shape(const shape& s) const - { - if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1)) - MIGRAPHX_THROW("Invalid shape for ck_gemm_softmax_gemm"); - } - - shape compute_shape(std::vector inputs, const std::vector&) const - { - check_shapes{inputs, *this}.same_ndims(); - if(inputs.size() < 3) - MIGRAPHX_THROW(name() + ": Expected 3 inputs but got " + to_string(inputs.size())); - auto a = inputs[0]; - auto b = inputs[1]; - auto b1 = inputs[2]; - for(const auto& input : inputs) - { - check_gemm_shape(input); - } - return op.compute_shape({op.compute_shape({a, b}), b1}); - } - - static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); } }; MIGRAPHX_REGISTER_OP(ck_gemm_softmax_gemm); @@ -203,61 +171,21 @@ struct find_ck_gemm } }; -auto is_mul_module(module& m) -{ - auto is_mul = - match::arg(0)(match::name("mul")(match::all_of[match::inputs()](match::name("@param")))); - return match_instruction(m, std::prev(m.end()), is_mul).result != m.end(); -} - -MIGRAPHX_PRED_MATCHER(is_pointwise_scale, instruction_ref ins) -{ - if(ins->name() != "pointwise") - return false; - if(ins->module_inputs().size() != 1) - return false; - return is_mul_module(*ins->module_inputs().front()); -} - struct find_ck_gemm_softmax_gemm { auto matcher() const { - auto gemm1 = - 
match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); - auto mul = match::name("pointwise")( - match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1))( - is_pointwise_scale()); - auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); - - return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax)); + return match::name("gpu::pre_gemm_softmax_gemm"); } void apply(module_pass_manager& mpm, const match::matcher_result& r) const { auto ins = r.result; - auto gemm2_ins = r.instructions["gemm2"]; - auto gemm1_ins = r.instructions["gemm1"]; - auto scale_lit = r.instructions["scale"]; - - if(not ck_gemm_softmax_gemm::is_ck_supported_type(gemm1_ins->get_shape().type())) - return; - - float scale = 1.0; - scale_lit->eval().visit([&](const auto s) { - // CK only supports single-valued scale - if(std::all_of( - s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) - scale = s.front(); - else - return; - }); - - auto inputs = gemm1_ins->inputs(); // A, B - inputs.push_back(gemm2_ins->inputs().back()); // B1 - + auto v = ins->get_operator().to_value(); + assert(v.contains("scale")); + auto scale = v.at("scale").to(); mpm.get_module().replace_instruction( - ins, ck_gemm_softmax_gemm{gemm2_ins->get_operator(), scale}, inputs); + ins, ck_gemm_softmax_gemm{migraphx::make_op("dot"), scale}, ins->inputs()); } }; @@ -265,8 +193,7 @@ struct find_ck_gemm_softmax_gemm void fuse_ck::apply(module_pass_manager& mpm) const { - match::find_matches(mpm, find_ck_gemm_softmax_gemm{}); - match::find_matches(mpm, find_ck_gemm_pointwise{}); + match::find_matches(mpm, find_ck_gemm_softmax_gemm{}, find_ck_gemm_pointwise{}); match::find_matches(mpm, find_ck_gemm{}); } diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp index b70bf107792..8ca9c49e5d9 100644 --- a/src/targets/gpu/include/migraphx/gpu/ck.hpp +++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp @@ -24,7 +24,7 @@ #ifndef MIGRAPHX_GUARD_GPU_CK_HPP #define MIGRAPHX_GUARD_GPU_CK_HPP -#include +#include #include #include @@ -35,10 +35,6 @@ namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG); -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK); - // NOLINTNEXTLINE const char* const disable_warning_pragma = R"__migraphx__( #pragma clang diagnostic push @@ -78,7 +74,7 @@ static std::vector create_ck_headers() return srcs; } -static const std::vector& ck_headers() +static inline const std::vector& ck_headers() { static const auto& headers = create_ck_headers(); return headers; @@ -86,27 +82,6 @@ static const std::vector& ck_headers() inline bool transposed_matrix(const shape& s) { return s.strides().back() != 1; } -inline float matrix_distance(const shape& x, const shape& y) -{ - if(x.type() != y.type()) - return std::numeric_limits::max(); - if(transposed_matrix(x) != transposed_matrix(y)) - return std::numeric_limits::max(); - auto sum_squared = std::inner_product(x.lens().rbegin(), - x.lens().rbegin() + 2, - y.lens().rbegin(), - 0, - std::plus<>{}, - [](auto a, auto b) { return (a - b) * (a - b); }); - return std::sqrt(sum_squared); -} - -inline std::string get_layout(const shape& s) -{ - return transposed_matrix(s) ? 
"ck::tensor_layout::gemm::ColumnMajor" - : "ck::tensor_layout::gemm::RowMajor"; -} - inline ck::host::DataType get_type(const shape& s) { if(s.type() == shape::half_type) diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp index 0c93c6c67db..b7cf40964e7 100644 --- a/src/targets/gpu/prefuse_ops.cpp +++ b/src/targets/gpu/prefuse_ops.cpp @@ -24,15 +24,15 @@ #include #include #include -#include -#include #include #include #include +#include namespace migraphx { inline namespace MIGRAPHX_INLINE_NS { namespace gpu { + namespace { template @@ -120,6 +120,60 @@ struct find_add_layernorm m.replace_instruction(ins, add_layernorm{op.epsilon}, add_ins->inputs()); } }; + +struct pre_gemm_softmax_gemm : gemm_softmax_gemm +{ + std::string name() const { return "gpu::pre_gemm_softmax_gemm"; } +}; +MIGRAPHX_REGISTER_OP(pre_gemm_softmax_gemm); + +MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins) +{ + if(ins->name() != "dot") + return false; + if(not pre_gemm_softmax_gemm::is_ck_supported_type(ins->get_shape().type())) + return false; + return true; +} + +struct find_gemm_softmax_gemm +{ + auto matcher() const + { + auto gemm1 = + match::skip(match::name("contiguous"))(match::name("dot")(is_ck_gemm().bind("gemm1"))); + auto mul = match::name("mul")( + match::nargs(2), match::either_arg(0, 1)(match::is_constant().bind("scale"), gemm1)); + auto softmax = match::name("softmax")(match::arg(0)(mul)).bind("softmax"); + + return match::name("dot")(is_ck_gemm().bind("gemm2"))(match::arg(0)(softmax)); + } + + void apply(module_pass_manager& mpm, const match::matcher_result& r) const + { + auto ins = r.result; + auto gemm2_ins = r.instructions["gemm2"]; + auto gemm1_ins = r.instructions["gemm1"]; + auto scale_lit = r.instructions["scale"]; + + float scale = 1.0; + scale_lit->eval().visit([&](const auto s) { + // CK only supports single-valued scale + if(std::all_of( + s.begin() + 1, s.end(), [&](auto v) { return float_equal(v, s.front()); })) + scale = s.front(); + else + return; + }); + + auto inputs = gemm1_ins->inputs(); // A, B + inputs.push_back(gemm2_ins->inputs().back()); // B1 + + mpm.get_module().replace_instruction( + ins, pre_gemm_softmax_gemm{gemm2_ins->get_operator(), scale}, inputs); + } +}; + } // namespace void prefuse_ops::apply(module_pass_manager& mpm) const @@ -127,6 +181,8 @@ void prefuse_ops::apply(module_pass_manager& mpm) const match::find_matches(mpm.get_module(), find_layernorm{}); mpm.run_pass(dead_code_elimination{}); match::find_matches(mpm.get_module(), find_add_layernorm{}); + if (enabled(MIGRAPHX_ENABLE_CK{})) + match::find_matches(mpm, find_gemm_softmax_gemm{}); } } // namespace gpu diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp index faefd3ab155..cb94d8759fb 100644 --- a/src/targets/gpu/target.cpp +++ b/src/targets/gpu/target.cpp @@ -53,6 +53,7 @@ #include #include #include +#include #include #include #include @@ -76,9 +77,6 @@ namespace gpu { MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION) MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC) -#ifndef _WIN32 -MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK) -#endif struct id_pass { From 4acc55c3385f779972b8795103e1760fbe9396de Mon Sep 17 00:00:00 2001 From: Alan Turner Date: Wed, 4 Oct 2023 15:07:26 +0000 Subject: [PATCH 22/38] Formatting --- src/targets/gpu/fuse_ck.cpp | 7 ++----- src/targets/gpu/prefuse_ops.cpp | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/src/targets/gpu/fuse_ck.cpp 
index ab3db9736bc..b8955710cc0 100644
--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -173,15 +173,12 @@ struct find_ck_gemm
 
 struct find_ck_gemm_softmax_gemm
 {
-    auto matcher() const
-    {
-        return match::name("gpu::pre_gemm_softmax_gemm");
-    }
+    auto matcher() const { return match::name("gpu::pre_gemm_softmax_gemm"); }
 
     void apply(module_pass_manager& mpm, const match::matcher_result& r) const
     {
         auto ins       = r.result;
-        auto v = ins->get_operator().to_value();
+        auto v         = ins->get_operator().to_value();
         assert(v.contains("scale"));
         auto scale = v.at("scale").to<float>();
         mpm.get_module().replace_instruction(
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index b7cf40964e7..6114db7383d 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -181,7 +181,7 @@ void prefuse_ops::apply(module_pass_manager& mpm) const
     match::find_matches(mpm.get_module(), find_layernorm{});
     mpm.run_pass(dead_code_elimination{});
     match::find_matches(mpm.get_module(), find_add_layernorm{});
-    if (enabled(MIGRAPHX_ENABLE_CK{}))
+    if(enabled(MIGRAPHX_ENABLE_CK{}))
         match::find_matches(mpm, find_gemm_softmax_gemm{});
 }
 
From f19e41bb012d1bbaa6c18c4568f9d4e7458ec668 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 4 Oct 2023 15:22:51 +0000
Subject: [PATCH 23/38] Fix cppcheck + other cleanup

---
 src/include/migraphx/ck.hpp                   | 4 ++--
 src/targets/gpu/jit/ck_gemm.cpp               | 1 +
 src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp  | 1 +
 .../migraphx/kernels/ck_gemm_softmax_gemm.hpp | 4 ++--
 test/onnx/gemm_softmax_gemm_test.onnx         | Bin 340 -> 0 bytes
 5 files changed, 6 insertions(+), 4 deletions(-)
 delete mode 100644 test/onnx/gemm_softmax_gemm_test.onnx

diff --git a/src/include/migraphx/ck.hpp b/src/include/migraphx/ck.hpp
index 2e4229c6de1..17804ba4b4e 100644
--- a/src/include/migraphx/ck.hpp
+++ b/src/include/migraphx/ck.hpp
@@ -49,12 +49,12 @@ struct gemm_softmax_gemm
         return pack(f(self.op, "op"), f(self.scale, "scale"));
     }
 
-    std::string name() const { return "pre_gemm_softmax_gemm"; }
+    std::string name() const { return "gemm_softmax_gemm"; }
 
     void check_gemm_shape(const shape& s) const
     {
         if(not contains(range(s.strides().rbegin(), s.strides().rbegin() + 3), 1))
-            MIGRAPHX_THROW("Invalid shape for ck_gemm_softmax_gemm");
+            MIGRAPHX_THROW("Invalid shape for " + name());
     }
 
     shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>&) const
diff --git a/src/targets/gpu/jit/ck_gemm.cpp b/src/targets/gpu/jit/ck_gemm.cpp
index 523a21fba0b..d23ff4fe978 100644
--- a/src/targets/gpu/jit/ck_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm.cpp
@@ -83,6 +83,7 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
         const auto& b_shape = inputs[1];
         const auto& c_shape = inputs.back();
 
+        // cppcheck-suppress unreadVariable
         auto rank        = a_shape.ndim();
         auto batch_count = get_batch_count(c_shape);
         auto m           = c_shape.lens()[rank - 2];
diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
index c9bab757f40..dca8151872e 100644
--- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
@@ -90,6 +90,7 @@ struct ck_gemm_softmax_gemm_compiler : compiler<ck_gemm_softmax_gemm_compiler>
         const auto& b1_shape = inputs[2];
         const auto& c_shape  = inputs.back();
 
+        // cppcheck-suppress unreadVariable
        auto rank        = a_shape.ndim();
         auto batch_count = get_batch_count(c_shape);
         auto m           = c_shape.lens()[rank - 2];
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
index 41021d66c57..80d4f69f549 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ck_gemm_softmax_gemm.hpp
@@ -21,8 +21,8 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
-#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP
+#define MIGRAPHX_GUARD_KERNELS_CK_GEMM_SOFTMAX_GEMM_HPP
 
 #include
 #include
diff --git a/test/onnx/gemm_softmax_gemm_test.onnx b/test/onnx/gemm_softmax_gemm_test.onnx
deleted file mode 100644
index 7c290563a2d54eca272913cdc2209f665aa28493..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 340
zcmZ{fyAFat5Jla^$9OeL)<B}odf)e@_VXA!d?9+-MaGe!B#%qWGRg)u
zKv&&mCP;?iM>vKdwy1#^RRQM6;8fONjV1?IuCN*v|Jog!7%>|TV_w2
e{hOFzOb`T4%#4BR2hNMTEmbXGO4_o=?db~>Qcvvw

From 29aae08283a8435f09bfac494e15d5b986b44851 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 4 Oct 2023 16:29:38 +0000
Subject: [PATCH 24/38] Formatting

---
 src/targets/gpu/fuse_ck.cpp                  | 4 ++--
 src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp | 8 ++++----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp
index b8955710cc0..1d51b8f34f3 100644
--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -177,8 +177,8 @@ struct find_ck_gemm_softmax_gemm
 
     void apply(module_pass_manager& mpm, const match::matcher_result& r) const
     {
-        auto ins       = r.result;
-        auto v         = ins->get_operator().to_value();
+        auto ins = r.result;
+        auto v   = ins->get_operator().to_value();
         assert(v.contains("scale"));
         auto scale = v.at("scale").to<float>();
         mpm.get_module().replace_instruction(
diff --git a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
index dca8151872e..4176ed04e1d 100644
--- a/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm_softmax_gemm.cpp
@@ -129,10 +129,10 @@ struct ck_gemm_softmax_gemm_compiler : compiler<ck_gemm_softmax_gemm_compiler>
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
-        const auto& c_shape = inputs.back();
-        auto tuning_value = v.get("tuning_value", 5);
-        auto batch_count = get_batch_count(c_shape);
-        auto problem = create_problem(inputs, v);
+        const auto& c_shape       = inputs.back();
+        auto tuning_value         = v.get("tuning_value", 5);
+        auto batch_count          = get_batch_count(c_shape);
+        auto problem              = create_problem(inputs, v);
         const auto include_header = problem.GetIncludeHeader();
         const auto solutions      = problem.GetSolutions(ctx.get_current_device().get_gfx_name());
 
From c04204af9544098a491546928371559f50cdac4b Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 4 Oct 2023 20:41:20 +0000
Subject: [PATCH 25/38] Fix header schema

---
 src/targets/gpu/fuse_ck.cpp                 |  3 ++-
 src/targets/gpu/include/migraphx/gpu/ck.hpp | 12 +++++++++--
 .../migraphx/gpu/gemm_softmax_gemm.hpp}     | 21 +++++++------------
 src/targets/gpu/prefuse_ops.cpp             |  3 ++-
 src/targets/gpu/target.cpp                  |  2 +-
 5 files changed, 23 insertions(+), 18 deletions(-)
 rename src/{include/migraphx/ck.hpp => targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp} (87%)

diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp
index 1d51b8f34f3..0e17ccba943 100644
--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -22,10 +22,11 @@
  * THE SOFTWARE.
  */
 #include
+#include
 #include
 #include
 #include
-#include
+#include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp
index 8ca9c49e5d9..b8f648c8d86 100644
--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
@@ -24,9 +24,10 @@
 #ifndef MIGRAPHX_GUARD_GPU_CK_HPP
 #define MIGRAPHX_GUARD_GPU_CK_HPP
 
-#include
-#include
 #include
+#include
+#include
+#include
 
 #include "ck/host/device_gemm_multiple_d.hpp"
 #include "ck/host/device_batched_gemm_softmax_gemm.hpp"
@@ -35,6 +36,13 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 
+#ifndef _WIN32
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
+#endif
+
 // NOLINTNEXTLINE
 const char* const disable_warning_pragma = R"__migraphx__(
 #pragma clang diagnostic push
diff --git a/src/include/migraphx/ck.hpp b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
similarity index 87%
rename from src/include/migraphx/ck.hpp
rename to src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
index 17804ba4b4e..241ae8bff8d 100644
--- a/src/include/migraphx/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
@@ -21,22 +21,16 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  * THE SOFTWARE.
  */
-#ifndef MIGRAPHX_GUARD_CK_HPP
-#define MIGRAPHX_GUARD_CK_HPP
+#ifndef MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+#define MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
+
 
-#include
 #include
 #include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
-
-#ifndef _WIN32
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_LOG_CK_GEMM);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_CK_DEBUG);
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TUNE_CK);
-#endif
+namespace gpu {
 
 struct gemm_softmax_gemm
 {
@@ -49,7 +43,7 @@ struct gemm_softmax_gemm
         return pack(f(self.op, "op"), f(self.scale, "scale"));
     }
 
-    std::string name() const { return "gemm_softmax_gemm"; }
+    std::string name() const { return "gpu::gemm_softmax_gemm"; }
 
     void check_gemm_shape(const shape& s) const
     {
@@ -75,7 +69,8 @@ struct gemm_softmax_gemm
     static bool is_ck_supported_type(shape::type_t t) { return contains({shape::half_type}, t); }
 };
 
+} // namespace gpu
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
-
-#endif
+#endif // MIGRAPHX_GUARD_GPU_CK_HPP
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index 6114db7383d..e689eb7e741 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -23,11 +23,12 @@
  */
 #include
 #include
+#include
 #include
 #include
 #include
 #include
-#include
+#include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index cb94d8759fb..ed1b1051654 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -53,7 +53,7 @@
 #include
 #include
 #include
-#include
+#include
 #include
 #include
 #include

From 5a3dff21e1d8fbca826e0343bc8c6c9e6ebfce56 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 4 Oct 2023 20:41:25 +0000
Subject: [PATCH 26/38] Formatting

---
 src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
index 241ae8bff8d..bd4418384db 100644
--- a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
@@ -24,7 +24,6 @@
 #ifndef MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
 #define MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP
 
-
 #include
 #include
 
From 321367be6ea4ed1eb596296bc32bebdc41bc968b Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 4 Oct 2023 20:44:03 +0000
Subject: [PATCH 27/38] Naming

---
 src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
index bd4418384db..b0b22fe0ef5 100644
--- a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
@@ -72,4 +72,4 @@ struct gemm_softmax_gemm
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
 
-#endif // MIGRAPHX_GUARD_GPU_CK_HPP
+#endif // MIGRAPHX_GUARD_GPU_GEMM_SOFTMAX_GEMM_HPP

From 51b6ddfbdfb70ebca826e0343bc8c6c9e6ebfce56 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Tue, 10 Oct 2023 20:09:40 +0000
Subject: [PATCH 28/38] Use new embed.cmake

---
 requirements.txt                            |  2 +-
 src/targets/gpu/include/migraphx/gpu/ck.hpp | 16 ++++++++--------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index 7ca70feb08a..1f81b81d882 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@4b0b327b81668978249fd9b6eb1c35214e7d78ea -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+#ROCmSoftwarePlatform/composable_kernel@761899c25989feb3c591b8fb0d0995509c3ecd20 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/rocMLIR@a48dfb1f163fb0b38369e73e580968b72e85b594 -DBUILD_FAT_LIBROCKCOMPILER=On
diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp
index b8f648c8d86..ce1c6f7108b 100644
--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 #include "ck/host/device_gemm_multiple_d.hpp"
 #include "ck/host/device_batched_gemm_softmax_gemm.hpp"
@@ -55,17 +56,17 @@ template <class P>
 std::string ck_disable_warnings(P p)
 {
     return interpolate_string(disable_warning_pragma,
-                              {{"content", std::string{p.first, p.second}}});
+                              {{"content", std::string{p.data(), p.size()}}});
 }
 
-static std::unordered_map<std::string, std::string> create_ck_header_strings()
+static std::unordered_map<std::string, std::string_view> create_ck_header_strings()
 {
-    std::unordered_map<std::string, std::string> result;
+    std::unordered_map<std::string, std::string_view> result;
     auto ck_headers = ck::host::GetHeaders();
 
     std::transform(
-        ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto&& p) {
-            return std::make_pair(p.first, ck_disable_warnings(p.second));
+        ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
+            return std::pair(p.first, ck_disable_warnings(p.second));
         });
     return result;
 }
@@ -75,9 +76,8 @@ static std::vector<src_file> create_ck_headers()
 {
     static const auto& header_strings = create_ck_header_strings();
     std::vector<src_file> srcs;
     std::transform(
-        header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto&& p) {
-            return src_file{fs::path{p.first},
-                            {p.second.data(), p.second.data() + p.second.size()}};
+        header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto& p) {
+            return src_file{p};
         });
     return srcs;
 }
 
From 3aeab10754ec64f538088535711d6e48e60fe751 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Tue, 10 Oct 2023 20:09:49 +0000
Subject: [PATCH 29/38] Formatting

---
 src/targets/gpu/include/migraphx/gpu/ck.hpp | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp
index ce1c6f7108b..595bb211705 100644
--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
@@ -66,7 +66,8 @@ static std::unordered_map<std::string, std::string_view> create_ck_header_strings()
 
     std::transform(
         ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
-            return std::pair(p.first, ck_disable_warnings(p.second));
+            return std::pair(p.first,
+                             ck_disable_warnings(p.second));
         });
     return result;
 }
@@ -75,10 +76,10 @@ static std::vector<src_file> create_ck_headers()
 {
     static const auto& header_strings = create_ck_header_strings();
     std::vector<src_file> srcs;
-    std::transform(
-        header_strings.begin(), header_strings.end(), std::back_inserter(srcs), [&](auto& p) {
-            return src_file{p};
-        });
+    std::transform(header_strings.begin(),
+                   header_strings.end(),
+                   std::back_inserter(srcs),
+                   [&](auto& p) { return src_file{p}; });
     return srcs;
 }
 
From f407c8c295ac3f68051138a96911abceb7af17a1 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Tue, 10 Oct 2023 20:14:25 +0000
Subject: [PATCH 30/38] CK SHA

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 1f81b81d882..f4837c2ebb0 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-#ROCmSoftwarePlatform/composable_kernel@761899c25989feb3c591b8fb0d0995509c3ecd20 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@d01af027c1d4a4683af02d5f19807de79b2ba14c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/rocMLIR@a48dfb1f163fb0b38369e73e580968b72e85b594 -DBUILD_FAT_LIBROCKCOMPILER=On

From 36ea759cf94bdc42a8c6294d1870e41ef0536d34 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 11 Oct 2023 15:46:49 +0000
Subject: [PATCH 31/38] Use string instead of string_view for embedded ck
 headers; use safe default instance for ck_gemm

---
 src/targets/gpu/include/migraphx/gpu/ck.hpp | 6 +++---
 src/targets/gpu/jit/ck_gemm.cpp             | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp
index 595bb211705..204e28f6393 100644
--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
@@ -59,14 +59,14 @@ std::string ck_disable_warnings(P p)
                               {{"content", std::string{p.data(), p.size()}}});
 }
 
-static std::unordered_map<std::string, std::string_view> create_ck_header_strings()
+static std::unordered_map<std::string, std::string> create_ck_header_strings()
 {
-    std::unordered_map<std::string, std::string_view> result;
+    std::unordered_map<std::string, std::string> result;
     auto ck_headers = ck::host::GetHeaders();
 
     std::transform(
         ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
-            return std::pair(p.first,
+            return std::pair<std::string, std::string>(p.first,
                              ck_disable_warnings(p.second));
         });
     return result;
 }
diff --git a/src/targets/gpu/jit/ck_gemm.cpp b/src/targets/gpu/jit/ck_gemm.cpp
index d23ff4fe978..7d0c9676e99 100644
--- a/src/targets/gpu/jit/ck_gemm.cpp
+++ b/src/targets/gpu/jit/ck_gemm.cpp
@@ -135,7 +135,7 @@ struct ck_gemm_compiler : compiler<ck_gemm_compiler>
 
     operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
     {
         const auto& c_shape = inputs.back();
-        auto tuning_value   = v.get("tuning_value", 0);
+        auto tuning_value   = v.get("tuning_value", 34);
         auto batch_count    = get_batch_count(c_shape);
         auto problem        = create_problem(inputs, v);

From c451aa9ee098b5e28141c2fc0e78dda903e3fc25 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 11 Oct 2023 15:46:54 +0000
Subject: [PATCH 32/38] Formatting

---
 src/targets/gpu/include/migraphx/gpu/ck.hpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/ck.hpp b/src/targets/gpu/include/migraphx/gpu/ck.hpp
index 204e28f6393..1b7f5ad3e81 100644
--- a/src/targets/gpu/include/migraphx/gpu/ck.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/ck.hpp
@@ -66,8 +66,7 @@ static std::unordered_map<std::string, std::string> create_ck_header_strings()
 
     std::transform(
         ck_headers.begin(), ck_headers.end(), std::inserter(result, result.begin()), [&](auto& p) {
-            return std::pair<std::string, std::string>(p.first,
-                             ck_disable_warnings(p.second));
+            return std::pair<std::string, std::string>(p.first, ck_disable_warnings(p.second));
         });
     return result;
 }

From 9861856e043317270e096ec6f4d803fa08048480 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Wed, 11 Oct 2023 15:49:32 +0000
Subject: [PATCH 33/38] Update CK SHA

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index f4837c2ebb0..ece740db71d 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -28,5 +28,5 @@ ROCmSoftwarePlatform/half@rocm-5.6.0
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
-ROCmSoftwarePlatform/composable_kernel@d01af027c1d4a4683af02d5f19807de79b2ba14c -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
+ROCmSoftwarePlatform/composable_kernel@70eefcf4f263aa5c25f3c9ff0db8f6f199ef0fb9 -DCK_BUILD_JIT_LIB=On -DCMAKE_POSITION_INDEPENDENT_CODE=On
 ROCmSoftwarePlatform/rocMLIR@a48dfb1f163fb0b38369e73e580968b72e85b594 -DBUILD_FAT_LIBROCKCOMPILER=On

From 44a77559c59d28ad8a8942c168edde94f05ad286 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Thu, 12 Oct 2023 15:17:29 +0000
Subject: [PATCH 34/38] Cppcheck

---
 src/include/migraphx/instruction_ref.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/migraphx/instruction_ref.hpp b/src/include/migraphx/instruction_ref.hpp
index 706c082d0b3..cc62e53a746 100644
--- a/src/include/migraphx/instruction_ref.hpp
+++ b/src/include/migraphx/instruction_ref.hpp
@@ -57,7 +57,7 @@ struct instruction_ref : std::list<instruction>::iterator
                              std::is_same<U, instruction_ref>{})>
     friend bool operator!=(const T& x, const U& y)
     {
-        return !(x == y);
+        return not (x == y);
     }
 };
 #else

From b71fdf8cd6369cd64c6787835b32fc8a61146e4d Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Thu, 12 Oct 2023 15:17:38 +0000
Subject: [PATCH 35/38] Formatting

---
 src/include/migraphx/instruction_ref.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/include/migraphx/instruction_ref.hpp b/src/include/migraphx/instruction_ref.hpp
index cc62e53a746..eb183fb390b 100644
--- a/src/include/migraphx/instruction_ref.hpp
+++ b/src/include/migraphx/instruction_ref.hpp
@@ -57,7 +57,7 @@ struct instruction_ref : std::list<instruction>::iterator
                              std::is_same<U, instruction_ref>{})>
     friend bool operator!=(const T& x, const U& y)
     {
-        return not (x == y);
+        return not(x == y);
     }
 };
 #else

From 9548894b1301ca8d520b9e3b10890906efc1e909 Mon Sep 17 00:00:00 2001
From: turneram <71655887+turneram@users.noreply.github.com>
Date: Thu, 12 Oct 2023 13:10:46 -0500
Subject: [PATCH 36/38] Update
 src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp

Co-authored-by: Umang Yadav <29876643+umangyadav@users.noreply.github.com>
---
 src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
index b0b22fe0ef5..f27b30659ea 100644
--- a/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/gemm_softmax_gemm.hpp
@@ -1,7 +1,7 @@
 /*
  * The MIT License (MIT)
  *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal

From 1cd455e2d5840a0b572e325f19a014fcf13ad2ac Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Thu, 12 Oct 2023 19:50:29 +0000
Subject: [PATCH 37/38] Remove ck.hpp include where not needed

---
 src/targets/gpu/fuse_ck.cpp     | 1 -
 src/targets/gpu/prefuse_ops.cpp | 1 -
 src/targets/gpu/target.cpp      | 4 +++-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/targets/gpu/fuse_ck.cpp b/src/targets/gpu/fuse_ck.cpp
index 0e17ccba943..43c7087bce7 100644
--- a/src/targets/gpu/fuse_ck.cpp
+++ b/src/targets/gpu/fuse_ck.cpp
@@ -26,7 +26,6 @@
 #include
 #include
 #include
-#include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index e689eb7e741..daeaf26c4a7 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -28,7 +28,6 @@
 #include
 #include
 #include
-#include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
diff --git a/src/targets/gpu/target.cpp b/src/targets/gpu/target.cpp
index ed1b1051654..faefd3ab155 100644
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -53,7 +53,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -76,6 +76,9 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
+#ifndef _WIN32
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
+#endif
 
 struct id_pass
 {

From 99a9f234a7f0f6f71210bfd403456297eb13bbd1 Mon Sep 17 00:00:00 2001
From: Alan Turner
Date: Thu, 12 Oct 2023 20:38:38 +0000
Subject: [PATCH 38/38] Add ck.hpp back to prefuse_ops.cpp

---
 src/targets/gpu/prefuse_ops.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/targets/gpu/prefuse_ops.cpp b/src/targets/gpu/prefuse_ops.cpp
index daeaf26c4a7..e689eb7e741 100644
--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -28,6 +28,7 @@
 #include
 #include
 #include
+#include
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {