diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp
index 74aa9d24138..d2c7dfc8fda 100644
--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params(
     global = compute_global(local);
 }
 
+// Probe once whether the HIP compiler accepts -fno-offload-uniform-block,
+// i.e. whether non-uniform workgroups (global not a multiple of local) are supported.
+static bool hip_accept_non_uniform_wg()
+{
+    static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"});
+    return non_uniform_wg;
+}
+
 std::function<std::size_t(std::size_t)>
 compute_global_for(context& ctx, std::size_t n, std::size_t over)
 {
@@ -146,13 +154,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
     std::size_t max_global = ctx.get_current_device().get_cu_count() *
                              ctx.get_current_device().get_max_workitems_per_cu();
     return [n, over, max_global](std::size_t local) {
-        // hip require global workitems multiple of local workitems. It may degrade performance.
-        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available.
-        // https://reviews.llvm.org/D155213
-        std::size_t num_elements = ((n + local - 1) / local) * local;
-        std::size_t groups       = (num_elements + local - 1) / local;
-        std::size_t max_blocks   = max_global / local;
-        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
+        std::size_t num_elements = n;
+        if(not hip_accept_non_uniform_wg())
+        {
+            num_elements = (1 + (n - 1) / local) * local;
+        }
+        std::size_t groups     = 1 + (num_elements - 1) / local;
+        std::size_t max_blocks = max_global / local;
+        std::size_t nglobal    = std::min(max_blocks * over, groups) * local;
         return std::min(nglobal, num_elements);
     };
 }
@@ -183,6 +192,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option
         generate_args_hpp(options.virtual_inputs.empty() ?
                               options.inputs : options.virtual_inputs);
     srcs.emplace_back("args.hpp", args_hpp);
+    if(options.global % options.local != 0 and hip_accept_non_uniform_wg())
+        options.params += " -fno-offload-uniform-block";
+    else
+        assert(options.global % options.local == 0);
+
     options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global);
     options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local);
     options.params += " " + join_strings(compiler_warnings(), " ");
diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
index beaf645c38a..a015e02e964 100644
--- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp
@@ -31,6 +31,14 @@
 #include <migraphx/kernels/types.hpp>
 #include <migraphx/kernels/debug.hpp>
 
+#ifdef __clang__
+#pragma clang diagnostic push
+#pragma clang diagnostic ignored "-Wreserved-identifier"
+extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT
+extern "C" __device__ size_t __ockl_get_local_size(uint);          // NOLINT
+#pragma clang diagnostic pop
+#endif
+
 namespace migraphx {
 
 #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL)
@@ -45,43 +53,37 @@ inline __device__ __attribute__((const)) index_int compute_global_size()
     // This actualy works even when global is not divisible by local size.
     // This doesnt actually do a multiplicatiosn. Instead it calls a device
     // function to get the global size, which is why it works.
-    return blockDim.x * gridDim.x;                       // NOLINT
+    return blockDim.x * gridDim.x; // NOLINT
 #endif
 }
 
-// We cant just use blockDim.x to get the local size since its broken on hip
-// when global is not divisible by local size. In this case, we calulate the
-// size for the last group.
+#ifdef MIGRAPHX_NGROUP
+// If global is divisible by local then local can be a const
+#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
+#define MIGRAPHX_HAS_CONST_LOCAL 1
+#endif
+#endif
+
 inline __device__ __attribute__((const)) index_int compute_local_size()
 {
-#ifdef MIGRAPHX_NLOCAL
-    const auto nlocal = MIGRAPHX_NLOCAL;
-#else
-    const auto nlocal = blockDim.x; // NOLINT
-#endif
-#ifdef MIGRAPHX_NGROUP
-    const auto ngroup = MIGRAPHX_NGROUP;
+#ifdef MIGRAPHX_HAS_CONST_LOCAL
+    return MIGRAPHX_NLOCAL;
 #else
-    const auto ngroup = gridDim.x; // NOLINT
+    // Returns block size. For the non-uniform block it returns the size of the non-uniform block.
+    return __ockl_get_local_size(0); // NOLINT
 #endif
-    const auto group_id = blockIdx.x; // NOLINT
-    const auto nglobal  = compute_global_size();
-    if(group_id == ngroup - 1)
-    {
-        return 1 + (nglobal - 1) % nlocal;
-    }
-    else
-    {
-        return nlocal; // NOLINT
-    }
 }
 
-#ifdef MIGRAPHX_NGROUP
-// If global is divisible by local then local can be a const
-#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1)
-#define MIGRAPHX_HAS_CONST_LOCAL 1
-#endif
+inline __device__ __attribute__((const)) index_int compute_max_local_size()
+{
+#ifdef MIGRAPHX_NLOCAL
+    return MIGRAPHX_NLOCAL;
+#else
+    // Returns the block size. When workgroup has non-uniform block, this returns size of the
+    // uniform block.
+    return __ockl_get_enqueued_local_size(0); // NOLINT
 #endif
+}
 
 struct index
 {
@@ -126,8 +128,8 @@ struct index
 #else
     __device__ index_int max_nlocal() const
     {
-        MIGRAPHX_ASSERT(blockDim.x > 0);
-        return blockDim.x;
+        MIGRAPHX_ASSERT(compute_max_local_size() > 0);
+        return compute_max_local_size();
     }
 #endif
 
@@ -249,7 +251,8 @@ struct index
 #endif
 
 inline __device__ __attribute__((const)) index make_index()
 {
-    return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
+    return index{
+        blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT
 }
 
 } // namespace migraphx