diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp
index 74aa9d24138..d2c7dfc8fda 100644
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params(
     global = compute_global(local);
 }
 
+// Probe once whether the HIP compiler accepts -fno-offload-uniform-block,
+// i.e. whether non-uniform workgroups (global not a multiple of local) are supported.
+static bool hip_accept_non_uniform_wg()
+{
+    static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"});
+    return non_uniform_wg;
+}
+
 std::function<std::size_t(std::size_t)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over)
 {
@@ -146,13 +154,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
     std::size_t max_global = ctx.get_current_device().get_cu_count() *
                              ctx.get_current_device().get_max_workitems_per_cu();
     return [n, over, max_global](std::size_t local) {
-        // hip require global workitems multiple of local workitems. It may degrade performance.
-        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
-        // https://reviews.llvm.org/D155213
-        std::size_t num_elements = ((n + local - 1) / local) * local;
-        std::size_t groups       = (num_elements + local - 1) / local;
-        std::size_t max_blocks   = max_global / local;
-        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
+        std::size_t num_elements = n;
+        if(not hip_accept_non_uniform_wg())
+        {
+            num_elements = (1 + (n - 1) / local) * local;
+        }
+        std::size_t groups     = 1 + (num_elements - 1) / local;
+        std::size_t max_blocks = max_global / local;
+        std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
         return std::min(nglobal, num_elements);
     };
 }
@@ -183,6 +192,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
         generate_args_hpp(options.virtual_inputs.empty() ?
                               options.inputs : options.virtual_inputs);
     srcs.emplace_back("args.hpp", args_hpp);
+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
     options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
     options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
     options.params += " " + join_strings(compiler_warnings(), " ");
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index beaf645c38a..a015e02e964 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -31,6 +31,14 @@
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/debug.hpp>
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint);          // NOLINT
+#pragma clang diagnostic pop
+#endif
+
 namespace migraphx {
 
 #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
@@ -45,43 +53,37 @@ inline __device__ __attribute__((const)) index_int compute_global_size()
     // This actualy works even when global is not divisible by local size.
     // This doesnt actually do a multiplicatiosn. Instead it calls a device
     // function to get the global size, which is why it works.
-    return blockDim.x * gridDim.x;                       // NOLINT
+    return blockDim.x * gridDim.x; // NOLINT
 #endif
 }
 
-// We cant just use blockDim.x to get the local size since its broken on hip
-// when global is not divisible by local size. In this case, we calulate the
-// size for the last group.
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 inline __device__ __attribute__((const)) index_int compute_local_size()
 {
-#ifdef MIGRAPHX_NLOCAL
-    const auto nlocal = MIGRAPHX_NLOCAL;
-#else
-    const auto nlocal = blockDim.x; // NOLINT
-#endif
-#ifdef MIGRAPHX_NGROUP
-    const auto ngroup = MIGRAPHX_NGROUP;
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
 #else
-    const auto ngroup = gridDim.x; // NOLINT
+    // Returns block size. For the non-uniform block it returns the size of the non-uniform block.
+    return __ockl_get_local_size(0); // NOLINT
 #endif
-    const auto group_id = blockIdx.x; // NOLINT
-    const auto nglobal  = compute_global_size();
-    if(group_id == ngroup - 1)
-    {
-        return 1 + (nglobal - 1) % nlocal;
-    }
-    else
-    {
-        return nlocal; // NOLINT
-    }
 }
 
-#ifdef MIGRAPHX_NGROUP
-// If global is divisible by local then local can be a const
-#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
-#define MIGRAPHX_HAS_CONST_LOCAL 1
-#endif
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_NLOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When workgroup has non-uniform block, this returns size of the
+    // uniform block.
+    return __ockl_get_enqueued_local_size(0); // NOLINT
 #endif
+}
 
 struct index
 {
@@ -126,8 +128,8 @@ struct index
 #else
     __device__ index_int max_nlocal() const
     {
-        MIGRAPHX_ASSERT(blockDim.x > 0);
-        return blockDim.x;
+        MIGRAPHX_ASSERT(compute_max_local_size() > 0);
+        return compute_max_local_size();
     }
 #endif
 
@@ -249,7 +251,8 @@ struct index
 #endif
 
 inline __device__ __attribute__((const)) index make_index()
 {
-    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
+    return index{
+        blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }
 
 } // namespace migraphx