diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h index 3fbdc9debf5..1f87e50f722 100644 --- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h +++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h @@ -1180,18 +1180,15 @@ struct __parallel_find_or_nd_range_tuner 1) { - auto __current_iters_per_work_item = - oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size); - // Empirically found formula for GPU devices. const auto __rng_x = __rng_n / 4096; const auto __desired_iters_per_work_item = std::max(std::sqrt(__rng_x), 1.); - if (__current_iters_per_work_item < __desired_iters_per_work_item) + if (__iters_per_work_item < __desired_iters_per_work_item) { - auto __k = __desired_iters_per_work_item / __current_iters_per_work_item; - __k = std::pow(2, std::ceil(std::log2(__k))); - __n_groups = (std::size_t)std::ceil(__n_groups / __k); + auto __k = oneapi::dpl::__internal::__dpl_bit_ceil( + (std::size_t)std::floor(__desired_iters_per_work_item / __iters_per_work_item)); + __n_groups = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __k); assert(oneapi::dpl::__internal::__dpl_ceiling_div(__rng_n, __n_groups * __wgroup_size) <= __desired_iters_per_work_item);