ROCm · causten · Oct 20, 2023 · Sep 21, 2023 · Sep 21, 2023 · Sep 26, 2023
@@ -33,6 +33,8 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 void apply_quantizelinear(module& m, instruction_ref ins)
 {
     assert(ins->name() == "quantizelinear");
@@ -62,9 +64,22 @@ void apply_quantizelinear(module& m, instruction_ref ins)
         max_quant = qt.max();
         min_quant = qt.min();
     });
-    auto s        = add_zero_point->get_shape();
-    auto min_arg  = m.add_literal(literal{shape{s.type()}, {min_quant}});
-    auto max_arg  = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    auto s = add_zero_point->get_shape();
+    instruction_ref min_arg;
+    instruction_ref max_arg;
+
+    if(enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{}))
+    {
+        std::vector<int> min_data(s.elements(), min_quant);
+        std::vector<int> max_data(s.elements(), max_quant);
+        min_arg = m.add_literal(literal(s, min_data));
+        max_arg = m.add_literal(literal(s, max_data));
+    }
+    else
+    {
+        min_arg = m.add_literal(literal{shape{s.type()}, {min_quant}});
+        max_arg = m.add_literal(literal{shape{s.type()}, {max_quant}});
+    }
     auto saturate = insert_common_op(m, ins, make_op("clip"), {add_zero_point, min_arg, max_arg});
     m.replace_instruction(
         ins, make_op("convert", {{"target_type", ins->get_shape().type()}}), saturate);

@@ -26,6 +26,7 @@
 #include <migraphx/matcher.hpp>
 #include <migraphx/pass_manager.hpp>
 #include <migraphx/register_op.hpp>
+#include <migraphx/gpu/device_name.hpp>
 
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -92,6 +93,8 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
     auto m = a.lens()[a.lens().size() - 2];
     auto n = b.lens().back();
     auto k = a.lens().back();
+    auto batch_size = std::accumulate(
+        a.lens().rbegin() + 2, a.lens().rend(), std::size_t{1}, std::multiplies<std::size_t>());
     // Integer gemms must be divisible by 4 in ck
     if(contains({shape::int8_type, shape::int32_type}, ins->get_shape().type()))
     {
@@ -102,9 +105,17 @@ MIGRAPHX_PRED_MATCHER(is_ck_gemm, instruction_ref ins)
         if(k % 4 != 0)
             return false;
     }
-    // Skipping GEMMs with a K dimension greater than 2048 is a course-grained strategy
-    // to avoid poor-performing GEMM kernels from CK
-    // To-do: Investigate a more precise strategy
+    auto device_name = trim(split_string(get_device_name(), ':').front());
+    if(device_name == "gfx940")
+    {
+        if(ins->get_shape().type() == shape::half_type)
+        {
+            if(batch_size >= 64)
+                return m < 2048 or k <= 64 or n <= 384 or n >= 2048;
+            return true;
+        }
+        return true;
+    }
     return k <= 2048;
 }
 
@@ -140,6 +151,10 @@ struct find_ck_gemm_pointwise
                return not input->inputs().empty() and input->inputs().front()->name() == "capture";
            }))
             return;
+        if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [](auto input) {
+               return not input->inputs().empty() and input->inputs().front()->name() == "capture";
+           }))
+            return;
         assert(gemm_it != inputs.end());
         if(gemm_idx != 0)
         {

@@ -42,11 +42,14 @@
 #include <migraphx/op/lrn.hpp>
 #include <migraphx/op/reshape.hpp>
 #include <migraphx/op/unknown.hpp>
+#include <migraphx/env.hpp>
 
 #include <migraphx/serialize.hpp>
 
 #include "test.hpp"
 
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 migraphx::program optimize_onnx(const std::string& name, bool run_passes = false)
 {
     migraphx::onnx_options options;
@@ -5540,6 +5543,31 @@ TEST_CASE(qlinearmatmul_2D_test)
     EXPECT(p.sort() == prog.sort());
 }
 
+migraphx::instruction_ref insert_quantizelinear_clip(migraphx::module& m, // module that receives the bound literals and the clip
+                                                     const migraphx::instruction_ref ins, // forwarded to insert_common_op as its instruction argument
+                                                     const migraphx::instruction_ref round, // first clip input, i.e. the value being clipped
+                                                     const migraphx::shape s, // shape (or just its type) used for the bound literals
+                                                     const int64_t min_quant, // lower clip bound
+                                                     const int64_t max_quant) // upper clip bound
+{ // Test helper: adds min/max literals to m, then returns the "clip" built by insert_common_op.
+    migraphx::instruction_ref min_arg;
+    migraphx::instruction_ref max_arg;
+    if(migraphx::enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{})) // env-var switch: build bounds with the full shape s
+    {
+        std::vector<int> min_data(s.elements(), min_quant); // s.elements() copies; int64_t is converted to int here
+        std::vector<int> max_data(s.elements(), max_quant);
+        min_arg = m.add_literal(migraphx::literal(s, min_data));
+        max_arg = m.add_literal(migraphx::literal(s, max_data));
+    }
+    else // default: one-value literals whose shape is built from s.type() only
+    {
+        min_arg = m.add_literal(migraphx::literal{migraphx::shape{s.type()}, {min_quant}});
+        max_arg = m.add_literal(migraphx::literal{migraphx::shape{s.type()}, {max_quant}});
+    }
+
+    return migraphx::insert_common_op(m, ins, migraphx::make_op("clip"), {round, min_arg, max_arg}); // NOTE(review): presumably inserts before ins and broadcasts the bounds to a common shape - confirm
+}
+
 TEST_CASE(quantizelinear_test)
 {
     migraphx::program p;
@@ -5548,16 +5576,10 @@ TEST_CASE(quantizelinear_test)
     auto l1  = mm->add_parameter("1", {migraphx::shape::float_type, {1}});
     auto l1_mbcast =
         mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", {5}}}), l1);
-    auto div     = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
-    auto round   = mm->add_instruction(migraphx::make_op("round"), div);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {0}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {255}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), round, min_mbcast, max_mbcast);
+    auto div   = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
+    auto round = mm->add_instruction(migraphx::make_op("round"), div);
+    auto s     = round->get_shape();
+    auto clip  = insert_quantizelinear_clip(*mm, div, round, s, 0, 255);
     mm->add_instruction(
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::uint8_type)}}),
@@ -5579,16 +5601,10 @@ TEST_CASE(quantizelinear_int32_test)
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
         l0);
-    auto div     = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
-    auto round   = mm->add_instruction(migraphx::make_op("round"), div);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {0}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {255}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), round, min_mbcast, max_mbcast);
+    auto div   = mm->add_instruction(migraphx::make_op("div"), l0, l1_mbcast);
+    auto round = mm->add_instruction(migraphx::make_op("round"), div);
+    auto s     = round->get_shape();
+    auto clip  = insert_quantizelinear_clip(*mm, div, round, s, 0, 255);
     mm->add_instruction(
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::uint8_type)}}),
@@ -5615,15 +5631,9 @@ TEST_CASE(quantizelinear_zero_point_test)
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
         l2_mbcast);
-    auto add     = mm->add_instruction(migraphx::make_op("add"), round, l2_mbcast);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {-128}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {127}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), add, min_mbcast, max_mbcast);
+    auto add  = mm->add_instruction(migraphx::make_op("add"), round, l2_mbcast);
+    auto s    = round->get_shape();
+    auto clip = insert_quantizelinear_clip(*mm, div, add, s, -128, 127);
     mm->add_instruction(
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::int8_type)}}),
@@ -5654,15 +5664,9 @@ migraphx::program make_quantizelinear_axis_prog()
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::float_type)}}),
         l2_bcast);
-    auto add     = mm->add_instruction(migraphx::make_op("add"), round, l2_bcast);
-    auto s       = round->get_shape();
-    auto min_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {-128}});
-    auto max_arg = mm->add_literal(migraphx::literal{migraphx::shape{s.type()}, {127}});
-    auto min_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), min_arg);
-    auto max_mbcast =
-        mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), max_arg);
-    auto clip = mm->add_instruction(migraphx::make_op("clip"), add, min_mbcast, max_mbcast);
+    auto add  = mm->add_instruction(migraphx::make_op("add"), round, l2_bcast);
+    auto s    = round->get_shape();
+    auto clip = insert_quantizelinear_clip(*mm, div, add, s, -128, 127);
     mm->add_instruction(
         migraphx::make_op("convert",
                           {{"target_type", migraphx::to_value(migraphx::shape::int8_type)}}),

@@ -31,10 +31,13 @@
 #include <migraphx/ranges.hpp>
 #include <test.hpp>
 #include <migraphx/make_op.hpp>
+#include <migraphx/env.hpp>
 
 #include <migraphx/serialize.hpp>
 #include <migraphx/pass_manager.hpp>
 
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK_WORKAROUNDS);
+
 bool is_quantizelinear(migraphx::instruction& ins) { return ins.name() == "quantizelinear"; }
 bool is_dequantizelinear(migraphx::instruction& ins) { return ins.name() == "dequantizelinear"; }
 bool is_clip_scalar(migraphx::instruction& ins)
@@ -82,7 +85,11 @@ TEST_CASE(quantizelinear)
     EXPECT(any_of(*p1.get_main_module(), &is_quantizelinear));
     EXPECT(none_of(*p2.get_main_module(), &is_quantizelinear));
     // ensure clip literals created in quantized program are scalar
-    EXPECT(any_of(*p2.get_main_module(), &is_clip_scalar));
+    // unless CK workarounds are enabled
+    if(migraphx::enabled(MIGRAPHX_ENABLE_CK_WORKAROUNDS{}))
+        EXPECT(none_of(*p2.get_main_module(), &is_clip_scalar));
+    else
+        EXPECT(any_of(*p2.get_main_module(), &is_clip_scalar));
 }
 
 TEST_CASE(dequantizelinear)