diff --git a/src/include/migraphx/argument.hpp b/src/include/migraphx/argument.hpp index 0326e460b0d..6f78d952d5c 100644 --- a/src/include/migraphx/argument.hpp +++ b/src/include/migraphx/argument.hpp @@ -46,7 +46,7 @@ struct MIGRAPHX_EXPORT argument : raw_data { argument() = default; - argument(const shape& s); + explicit argument(const shape& s); template ()())>{})> argument(shape s, F d) diff --git a/src/include/migraphx/op/allocate.hpp b/src/include/migraphx/op/allocate.hpp index 33ea6bb2260..e2670c64c11 100644 --- a/src/include/migraphx/op/allocate.hpp +++ b/src/include/migraphx/op/allocate.hpp @@ -88,13 +88,13 @@ struct allocate { if(args.empty()) { - return {output_shape}; + return argument{output_shape}; } else { std::vector output_dims(output_shape.ndim()); args.at(0).visit([&](auto a) { output_dims.assign(a.begin(), a.end()); }); - return {shape{buf_type, output_dims}}; + return argument{shape{buf_type, output_dims}}; } } }; diff --git a/src/include/migraphx/op/pooling.hpp b/src/include/migraphx/op/pooling.hpp index 276ad3295fe..7bfe456f3a0 100644 --- a/src/include/migraphx/op/pooling.hpp +++ b/src/include/migraphx/op/pooling.hpp @@ -411,7 +411,7 @@ struct pooling // for dynamic GlobalPooling, there's no padding kernel_dims.insert(kernel_dims.end(), input_lens.begin() + 2, input_lens.end()); output_shape = dyn_out.computed_shape; - result = dyn_out.computed_shape; + result = argument{dyn_out.computed_shape}; } else if((padding_mode != op::padding_mode_t::default_)) { @@ -439,7 +439,7 @@ struct pooling { kernel_dims = this->lengths; output_shape = dyn_out.computed_shape; - result = dyn_out.computed_shape; + result = argument{dyn_out.computed_shape}; } // Perform the computation and populate result diff --git a/src/targets/gpu/compile_hip_code_object.cpp b/src/targets/gpu/compile_hip_code_object.cpp index 74aa9d24138..d2c7dfc8fda 100644 --- a/src/targets/gpu/compile_hip_code_object.cpp +++ b/src/targets/gpu/compile_hip_code_object.cpp @@ -139,6 +139,12 @@ void hip_compile_options::set_launch_params( global = compute_global(local); } +static bool hip_accept_non_uniform_wg() +{ + static bool non_uniform_wg = hip_has_flags({"-fno-offload-uniform-block"}); + return non_uniform_wg; +} + std::function compute_global_for(context& ctx, std::size_t n, std::size_t over) { @@ -146,13 +152,14 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over) std::size_t max_global = ctx.get_current_device().get_cu_count() * ctx.get_current_device().get_max_workitems_per_cu(); return [n, over, max_global](std::size_t local) { - // hip require global workitems multiple of local workitems. It may degrade performance. - // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available. - // https://reviews.llvm.org/D155213 - std::size_t num_elements = ((n + local - 1) / local) * local; - std::size_t groups = (num_elements + local - 1) / local; - std::size_t max_blocks = max_global / local; - std::size_t nglobal = std::min(max_blocks * over, groups) * local; + std::size_t num_elements = n; + if(not hip_accept_non_uniform_wg()) + { + num_elements = (1 + (n - 1) / local) * local; + } + std::size_t groups = 1 + (num_elements - 1) / local; + std::size_t max_blocks = max_global / local; + std::size_t nglobal = std::min(max_blocks * over, groups) * local; return std::min(nglobal, num_elements); }; } @@ -183,6 +190,11 @@ operation compile_hip_code_object(const std::string& content, hip_compile_option generate_args_hpp(options.virtual_inputs.empty() ? options.inputs : options.virtual_inputs); srcs.emplace_back("args.hpp", args_hpp); + if(options.global % options.local != 0 and hip_accept_non_uniform_wg()) + options.params += " -fno-offload-uniform-block"; + else + assert(options.global % options.local == 0); + options.params += " -DMIGRAPHX_NGLOBAL=" + std::to_string(options.global); options.params += " -DMIGRAPHX_NLOCAL=" + std::to_string(options.local); options.params += " " + join_strings(compiler_warnings(), " "); diff --git a/src/targets/gpu/include/migraphx/gpu/convolution.hpp b/src/targets/gpu/include/migraphx/gpu/convolution.hpp index d6680f17ec8..f88cee86855 100644 --- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp +++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp @@ -199,9 +199,9 @@ struct miopen_convolution // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6 preallocate = true; #endif - auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0]; - auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1]; - auto y = preallocate ? allocate_gpu(output_shape) : inputs[2]; + auto x = preallocate ? to_gpu(generate_argument(x_shape)) : argument{inputs[0]}; + auto w = preallocate ? to_gpu(generate_argument(w_shape)) : argument{inputs[1]}; + auto y = preallocate ? allocate_gpu(output_shape) : argument{inputs[2]}; auto workspace = preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape); diff --git a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp index beaf645c38a..a015e02e964 100644 --- a/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp +++ b/src/targets/gpu/kernels/include/migraphx/kernels/index.hpp @@ -31,6 +31,14 @@ #include #include +#ifdef __clang__ +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreserved-identifier" +extern "C" __device__ size_t __ockl_get_enqueued_local_size(uint); // NOLINT +extern "C" __device__ size_t __ockl_get_local_size(uint); // NOLINT +#pragma clang diagnostic pop +#endif + namespace migraphx { #if defined(MIGRAPHX_NGLOBAL) && defined(MIGRAPHX_NLOCAL) @@ -45,43 +53,37 @@ inline __device__ __attribute__((const)) index_int compute_global_size() // This actualy works even when global is not divisible by local size. // This doesnt actually do a multiplicatiosn. Instead it calls a device // function to get the global size, which is why it works. - return blockDim.x * gridDim.x; // NOLINT + return blockDim.x * gridDim.x; // NOLINT #endif } -// We cant just use blockDim.x to get the local size since its broken on hip -// when global is not divisible by local size. In this case, we calulate the -// size for the last group. +#ifdef MIGRAPHX_NGROUP +// If global is divisible by local then local can be a const +#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1) +#define MIGRAPHX_HAS_CONST_LOCAL 1 +#endif +#endif + inline __device__ __attribute__((const)) index_int compute_local_size() { -#ifdef MIGRAPHX_NLOCAL - const auto nlocal = MIGRAPHX_NLOCAL; -#else - const auto nlocal = blockDim.x; // NOLINT -#endif -#ifdef MIGRAPHX_NGROUP - const auto ngroup = MIGRAPHX_NGROUP; +#ifdef MIGRAPHX_HAS_CONST_LOCAL + return MIGRAPHX_NLOCAL; #else - const auto ngroup = gridDim.x; // NOLINT + // Returns block size. For the non-uniform block it returns the size of the non-uniform block. + return __ockl_get_local_size(0); // NOLINT #endif - const auto group_id = blockIdx.x; // NOLINT - const auto nglobal = compute_global_size(); - if(group_id == ngroup - 1) - { - return 1 + (nglobal - 1) % nlocal; - } - else - { - return nlocal; // NOLINT - } } -#ifdef MIGRAPHX_NGROUP -// If global is divisible by local then local can be a const -#if(MIGRAPHX_NGLOBAL % MIGRAPHX_NLOCAL == 0) || (MIGRAPHX_NGROUP == 1) -#define MIGRAPHX_HAS_CONST_LOCAL 1 -#endif +inline __device__ __attribute__((const)) index_int compute_max_local_size() +{ +#ifdef MIGRAPHX_LOCAL + return MIGRAPHX_NLOCAL; +#else + // Returns the block size. When workgrop has non-uniform block, this returns size of the uniform + // block. + return __ockl_get_enqueued_local_size(0); // NOLINT #endif +} struct index { @@ -126,8 +128,8 @@ struct index #else __device__ index_int max_nlocal() const { - MIGRAPHX_ASSERT(blockDim.x > 0); - return blockDim.x; + MIGRAPHX_ASSERT(compute_max_local_size() > 0); + return compute_max_local_size(); } #endif @@ -249,7 +251,8 @@ struct index #endif inline __device__ __attribute__((const)) index make_index() { - return index{blockIdx.x * blockDim.x + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT + return index{ + blockIdx.x * compute_max_local_size() + threadIdx.x, threadIdx.x, blockIdx.x}; // NOLINT } } // namespace migraphx diff --git a/test/eliminate_allocation_test.cpp b/test/eliminate_allocation_test.cpp index 2bfc7a54809..ba1179d1be7 100644 --- a/test/eliminate_allocation_test.cpp +++ b/test/eliminate_allocation_test.cpp @@ -55,7 +55,7 @@ struct allocate const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; diff --git a/test/eliminate_concat_test.cpp b/test/eliminate_concat_test.cpp index 13984e98645..dc2834bfa91 100644 --- a/test/eliminate_concat_test.cpp +++ b/test/eliminate_concat_test.cpp @@ -60,7 +60,7 @@ struct concat const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; @@ -104,7 +104,7 @@ struct allocate const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; diff --git a/test/memory_coloring_test.cpp b/test/memory_coloring_test.cpp index 7716c8b89a8..7cbb3efdec6 100644 --- a/test/memory_coloring_test.cpp +++ b/test/memory_coloring_test.cpp @@ -55,7 +55,7 @@ struct allocate const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; diff --git a/test/normalize_ops_test.cpp b/test/normalize_ops_test.cpp index f9ec2f033c2..a48223dd5fe 100644 --- a/test/normalize_ops_test.cpp +++ b/test/normalize_ops_test.cpp @@ -57,7 +57,7 @@ struct normalize_test_op const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; diff --git a/test/replace_allocate.cpp b/test/replace_allocate.cpp index 90b8a943973..68e3cfd5d37 100644 --- a/test/replace_allocate.cpp +++ b/test/replace_allocate.cpp @@ -54,7 +54,7 @@ struct allocate_no_out : migraphx::auto_register_op const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; @@ -78,7 +78,7 @@ struct allocate_with_out : migraphx::auto_register_op const migraphx::shape& output_shape, const std::vector&) const { - return {output_shape}; + return migraphx::argument{output_shape}; } }; diff --git a/tools/accuracy/accuracy_checker.py b/tools/accuracy/accuracy_checker.py index d368ca2a29e..8752bbe7f78 100644 --- a/tools/accuracy/accuracy_checker.py +++ b/tools/accuracy/accuracy_checker.py @@ -220,10 +220,16 @@ def main(): else: test_input = np.zeros(in_shape).astype(get_np_datatype(in_type)) test_inputs[name] = test_input - params[name] = migraphx.argument(test_input) + migraphx_arg = migraphx.argument(test_input) + if not args.offload_copy: + migraphx_arg = migraphx.to_gpu(migraphx_arg) + params[name] = migraphx_arg if not args.ort_run: - pred_migx = np.array(model.run(params)[-1]) + if not args.offload_copy: + pred_migx = np.array(migraphx.from_gpu(model.run(params)[-1])) + else: + pred_migx = np.array(model.run(params)[-1]) if use_onnx: sess_op = ort.SessionOptions()