From 6e86734d12618328235748b2d54c9de1e3f203c7 Mon Sep 17 00:00:00 2001
From: kahmed10 <15948690+kahmed10@users.noreply.github.com>
Date: Wed, 18 Oct 2023 21:08:47 -0500
Subject: [PATCH 1/3] update script when offload copy is disabled (#2348)

---
 tools/accuracy/accuracy_checker.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/tools/accuracy/accuracy_checker.py b/tools/accuracy/accuracy_checker.py
index d368ca2a29e..8752bbe7f78 100644
--- a/tools/accuracy/accuracy_checker.py
+++ b/tools/accuracy/accuracy_checker.py
@@ -220,10 +220,16 @@ def main():
         else:
             test_input = np.zeros(in_shape).astype(get_np_datatype(in_type))
         test_inputs[name] = test_input
-        params[name] = migraphx.argument(test_input)
+        migraphx_arg = migraphx.argument(test_input)
+        if not args.offload_copy:
+            migraphx_arg = migraphx.to_gpu(migraphx_arg)
+        params[name] = migraphx_arg
 
     if not args.ort_run:
-        pred_migx = np.array(model.run(params)[-1])
+        if not args.offload_copy:
+            pred_migx = np.array(migraphx.from_gpu(model.run(params)[-1]))
+        else:
+            pred_migx = np.array(model.run(params)[-1])
 
     if use_onnx:
         sess_op = ort.SessionOptions()

From 581b1b5f2953feb6d8e9dcbb15d7b4e414097ae4 Mon Sep 17 00:00:00 2001
From: Umang Yadav <29876643+umangyadav@users.noreply.github.com>
Date: Thu, 19 Oct 2023 10:25:00 -0400
Subject: [PATCH 2/3] Add flag to accept non-uniform WG sizes (#2167)

* Disable -Wunsafe-buffer-usage when compiling gpu code
---
 src/targets/gpu/compile_hip_code_object.cpp   | 26 +++++---
 .../include/migraphx/kernels/index.hpp        | 63 ++++++++++---------
 2 files changed, 52 insertions(+), 37 deletions(-)

diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp
index 74aa9d24138..d2c7dfc8fda 100644
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params(
         global = compute_global(local);
 }
 
+static bool hip_accept_non_uniform_wg()
+{
+    static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"});
+    return non_uniform_wg;
+}
+
 std::function<std::size_t(std::size_t local)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over)
 {
@@ -146,13 +152,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
     std::size_t max_global = ctx.get_current_device().get_cu_count() *
                              ctx.get_current_device().get_max_workitems_per_cu();
     return [n, over, max_global](std::size_t local) {
-        // hip require global workitems multiple of local workitems. It may degrade performance.
-        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
-        // https://reviews.llvm.org/D155213
-        std::size_t num_elements = ((n + local - 1) / local) * local;
-        std::size_t groups       = (num_elements + local - 1) / local;
-        std::size_t max_blocks   = max_global / local;
-        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
+        std::size_t num_elements = n;
+        if(not hip_accept_non_uniform_wg())
+        {
+            num_elements = (1 + (n - 1) / local) * local;
+        }
+        std::size_t groups     = 1 + (num_elements - 1) / local;
+        std::size_t max_blocks = max_global / local;
+        std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
         return std::min(nglobal, num_elements);
     };
 }
@@ -183,6 +190,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
         generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs);
     srcs.emplace_back("args.hpp", args_hpp);
 
+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
     options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
     options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
     options.params += " " + join_strings(compiler_warnings(), " ");
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index beaf645c38a..a015e02e964 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -31,6 +31,14 @@
 #include <migraphx/kernels/debug.hpp>
 #include <migraphx/kernels/functional.hpp>
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint);          // NOLINT
+#pragma clang diagnostic pop
+#endif
+
 namespace migraphx {
 
 #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
@@ -45,43 +53,37 @@ inline __device__ __attribute__((const)) index_int compute_global_size()
     // This actualy works even when global is not divisible by local size.
     // This doesnt actually do a multiplicatiosn. Instead it calls a device
     // function to get the global size, which is why it works.
-    return blockDim.x * gridDim.x;  // NOLINT
+    return blockDim.x * gridDim.x; // NOLINT
 #endif
 }
 
-// We cant just use blockDim.x to get the local size since its broken on hip
-// when global is not divisible by local size. In this case, we calulate the
-// size for the last group.
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 inline __device__ __attribute__((const)) index_int compute_local_size()
 {
-#ifdef MIGRAPHX_NLOCAL
-    const auto nlocal = MIGRAPHX_NLOCAL;
-#else
-    const auto nlocal = blockDim.x; // NOLINT
-#endif
-#ifdef MIGRAPHX_NGROUP
-    const auto ngroup = MIGRAPHX_NGROUP;
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
 #else
-    const auto ngroup = gridDim.x;  // NOLINT
+    // Returns block size. For the non-uniform block it returns the size of the non-uniform block.
+    return __ockl_get_local_size(0); // NOLINT
 #endif
-    const auto group_id = blockIdx.x; // NOLINT
-    const auto nglobal  = compute_global_size();
-    if(group_id == ngroup - 1)
-    {
-        return 1 + (nglobal - 1) % nlocal;
-    }
-    else
-    {
-        return nlocal; // NOLINT
-    }
 }
 
-#ifdef MIGRAPHX_NGROUP
-// If global is divisible by local then local can be a const
-#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
-#define MIGRAPHX_HAS_CONST_LOCAL 1
-#endif
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_LOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When workgrop has non-uniform block, this returns size of the uniform
+    // block.
+    return __ockl_get_enqueued_local_size(0); // NOLINT
 #endif
+}
 
 struct index
 {
@@ -126,8 +128,8 @@ struct index
 #else
     __device__ index_int max_nlocal() const
     {
-        MIGRAPHX_ASSERT(blockDim.x > 0);
-        return blockDim.x;
+        MIGRAPHX_ASSERT(compute_max_local_size() > 0);
+        return compute_max_local_size();
     }
 #endif
 
@@ -249,7 +251,8 @@ struct index
 #endif
 inline __device__ __attribute__((const)) index make_index()
 {
-    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
+    return index{
+        blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }
 
 } // namespace migraphx

From 07848b28dbc135f34a4c1bf57dc9af039d7b31c6 Mon Sep 17 00:00:00 2001
From: Paul Fultz II <pfultz2@yahoo.com>
Date: Thu, 19 Oct 2023 09:25:56 -0500
Subject: [PATCH 3/3] Make argument constructor explicit (#2346)

---
 src/include/migraphx/argument.hpp                    | 2 +-
 src/include/migraphx/op/allocate.hpp                 | 4 ++--
 src/include/migraphx/op/pooling.hpp                  | 4 ++--
 src/targets/gpu/include/migraphx/gpu/convolution.hpp | 6 +++---
 test/eliminate_allocation_test.cpp                   | 2 +-
 test/eliminate_concat_test.cpp                       | 4 ++--
 test/memory_coloring_test.cpp                        | 2 +-
 test/normalize_ops_test.cpp                          | 2 +-
 test/replace_allocate.cpp                            | 4 ++--
 9 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/src/include/migraphx/argument.hpp b/src/include/migraphx/argument.hpp
index 0326e460b0d..6f78d952d5c 100644
--- a/src/include/migraphx/argument.hpp
+++ b/src/include/migraphx/argument.hpp
@@ -46,7 +46,7 @@ struct MIGRAPHX_EXPORT argument : raw_data<argument>
 {
     argument() = default;
 
-    argument(const shape& s);
+    explicit argument(const shape& s);
 
     template <class F, MIGRAPHX_REQUIRES(std::is_pointer<decltype(std::declval<F>()())>{})>
     argument(shape s, F d)
diff --git a/src/include/migraphx/op/allocate.hpp b/src/include/migraphx/op/allocate.hpp
index 33ea6bb2260..e2670c64c11 100644
--- a/src/include/migraphx/op/allocate.hpp
+++ b/src/include/migraphx/op/allocate.hpp
@@ -88,13 +88,13 @@ struct allocate
     {
         if(args.empty())
         {
-            return {output_shape};
+            return argument{output_shape};
         }
         else
         {
             std::vector<std::size_t> output_dims(output_shape.ndim());
             args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); });
-            return {shape{buf_type, output_dims}};
+            return argument{shape{buf_type, output_dims}};
         }
     }
 };
diff --git a/src/include/migraphx/op/pooling.hpp b/src/include/migraphx/op/pooling.hpp
index 276ad3295fe..7bfe456f3a0 100644
--- a/src/include/migraphx/op/pooling.hpp
+++ b/src/include/migraphx/op/pooling.hpp
@@ -411,7 +411,7 @@ struct pooling
             // for dynamic GlobalPooling, there's no padding
             kernel_dims.insert(kernel_dims.end(), input_lens.begin() + 2, input_lens.end());
             output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
         }
         else if((padding_mode != op::padding_mode_t::default_))
         {
@@ -439,7 +439,7 @@ struct pooling
         {
             kernel_dims  = this->lengths;
             output_shape = dyn_out.computed_shape;
-            result       = dyn_out.computed_shape;
+            result       = argument{dyn_out.computed_shape};
         }
 
         // Perform the computation and populate result
diff --git a/src/targets/gpu/include/migraphx/gpu/convolution.hpp b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
index d6680f17ec8..f88cee86855 100644
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -199,9 +199,9 @@ struct miopen_convolution
             // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
             preallocate = true;
 #endif
-            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
-            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
-            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]};
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]};
+            auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]};
             auto workspace =
                 preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
 
diff --git a/test/eliminate_allocation_test.cpp b/test/eliminate_allocation_test.cpp
index 2bfc7a54809..ba1179d1be7 100644
--- a/test/eliminate_allocation_test.cpp
+++ b/test/eliminate_allocation_test.cpp
@@ -55,7 +55,7 @@ struct allocate
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
diff --git a/test/eliminate_concat_test.cpp b/test/eliminate_concat_test.cpp
index 13984e98645..dc2834bfa91 100644
--- a/test/eliminate_concat_test.cpp
+++ b/test/eliminate_concat_test.cpp
@@ -60,7 +60,7 @@ struct concat
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
@@ -104,7 +104,7 @@ struct allocate
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
diff --git a/test/memory_coloring_test.cpp b/test/memory_coloring_test.cpp
index 7716c8b89a8..7cbb3efdec6 100644
--- a/test/memory_coloring_test.cpp
+++ b/test/memory_coloring_test.cpp
@@ -55,7 +55,7 @@ struct allocate
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
diff --git a/test/normalize_ops_test.cpp b/test/normalize_ops_test.cpp
index f9ec2f033c2..a48223dd5fe 100644
--- a/test/normalize_ops_test.cpp
+++ b/test/normalize_ops_test.cpp
@@ -57,7 +57,7 @@ struct normalize_test_op
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
diff --git a/test/replace_allocate.cpp b/test/replace_allocate.cpp
index 90b8a943973..68e3cfd5d37 100644
--- a/test/replace_allocate.cpp
+++ b/test/replace_allocate.cpp
@@ -54,7 +54,7 @@ struct allocate_no_out : migraphx::auto_register_op<allocate_no_out>
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };
 
@@ -78,7 +78,7 @@ struct allocate_with_out : migraphx::auto_register_op<allocate_with_out>
                                const migraphx::shape& output_shape,
                                const std::vector<migraphx::argument>&) const
     {
-        return {output_shape};
+        return migraphx::argument{output_shape};
     }
 };