Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add code change workaround for 64-bit reduce_by_segment bug #1791

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 0 additions & 10 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -287,16 +287,6 @@ if (ONEDPL_BACKEND MATCHES "^(tbb|dpcpp|dpcpp_only)$")
endif()
endif()

if (DEFINED ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
if(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1 option")
target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1")
else()
message(STATUS "Adding -DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0 option")
target_compile_options(oneDPL INTERFACE "-DONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=0")
endif()
endif()

# DPC++ specific macro
target_compile_definitions(oneDPL INTERFACE
$<$<OR:$<BOOL:${ONEDPL_USE_DEVICE_FPGA_HW}>,$<BOOL:${ONEDPL_USE_DEVICE_FPGA_EMU}>>:ONEDPL_FPGA_DEVICE>
Expand Down
8 changes: 6 additions & 2 deletions include/oneapi/dpl/internal/reduce_by_segment_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -351,7 +351,12 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy
__seg_reduce_wg_kernel,
#endif
sycl::nd_range<1>{__n_groups * __wgroup_size, __wgroup_size}, [=](sycl::nd_item<1> __item) {
::std::array<__val_type, __vals_per_item> __loc_partials;
auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>;
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Let's use __val_type instead of auto - it's will more readable, I think.

// TODO: Remove this initialization to the identity when possible. We load real data to __loc_partials
// in the first loop below but this initialization to the identity works around an IGC register
// filling bug.
std::array<__val_type, __vals_per_item> __loc_partials;
std::fill(__loc_partials.begin(), __loc_partials.end(), __identity);

auto __group = __item.get_group();
::std::size_t __group_id = __item.get_group(0);
Expand All @@ -368,7 +373,6 @@ __sycl_reduce_by_segment(__internal::__hetero_tag<_BackendTag>, _ExecutionPolicy

::std::size_t __max_end = 0;
::std::size_t __item_segments = 0;
auto __identity = unseq_backend::__known_identity<_BinaryOperator, __val_type>;

__val_type __accumulator = __identity;
for (::std::size_t __i = __start; __i < __end; ++__i)
Expand Down
17 changes: 3 additions & 14 deletions include/oneapi/dpl/pstl/hetero/dpcpp/unseq_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,21 +34,11 @@ namespace unseq_backend
//This optimization depends on Intel(R) oneAPI DPC++ Compiler implementation such as support of binary operators from std namespace.
//We need to use defined(SYCL_IMPLEMENTATION_INTEL) macro as a guard.

template <typename _Tp>
inline constexpr bool __can_use_known_identity =
# if ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it okay if we directly remove this macro check, or should it be deprecated first with a #warning or something similar? From the user's perspective, they should just see reduce_by_segment speedup with 64-bit types.

// When ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION is defined as non-zero, we avoid using known identity for 64-bit arithmetic data types
!(::std::is_arithmetic_v<_Tp> && sizeof(_Tp) == sizeof(::std::uint64_t));
# else
true;
# endif // ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION

//TODO: To change __has_known_identity implementation as soon as the Intel(R) oneAPI DPC++ Compiler implementation issues related to
//std::multiplies, std::bit_or, std::bit_and and std::bit_xor operations will be fixed.
//std::logical_and and std::logical_or are not supported in Intel(R) oneAPI DPC++ Compiler to be used in sycl::inclusive_scan_over_group and sycl::reduce_over_group
template <typename _BinaryOp, typename _Tp>
using __has_known_identity = ::std::conditional_t<
__can_use_known_identity<_Tp>,
using __has_known_identity =
# if _ONEDPL_LIBSYCL_VERSION >= 50200
typename ::std::disjunction<
__dpl_sycl::__has_known_identity<_BinaryOp, _Tp>,
Expand All @@ -60,16 +50,15 @@ using __has_known_identity = ::std::conditional_t<
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__minimum<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<void>>>>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__maximum<void>>>>>;
# else //_ONEDPL_LIBSYCL_VERSION >= 50200
typename ::std::conjunction<
::std::is_arithmetic<_Tp>,
::std::disjunction<::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, ::std::plus<void>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<_Tp>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<void>>>>,
::std::is_same<::std::decay_t<_BinaryOp>, __dpl_sycl::__plus<void>>>>;
# endif //_ONEDPL_LIBSYCL_VERSION >= 50200
::std::false_type>; // This is for the case of __can_use_known_identity<_Tp>==false

#else //_USE_GROUP_ALGOS && defined(SYCL_IMPLEMENTATION_INTEL)

Expand Down
5 changes: 0 additions & 5 deletions test/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators)
string(REPLACE "\.cpp" "" _test_name ${_test_name})

set(coal_tests "reduce.pass" "transform_reduce.pass" "count.pass" "sycl_iterator_reduce.pass" "minmax_element.pass")
set(workaround_for_igpu_64bit_reduction_tests "reduce_by_segment.pass")
# mark those tests with pstloffload_smoke_tests label
set (pstloffload_smoke_tests "adjacent_find.pass" "copy_move.pass" "merge.pass" "partial_sort.pass" "remove_copy.pass"
"transform_reduce.pass" "transform_reduce.pass.coal" "transform_scan.pass" "algorithm.pass"
Expand All @@ -210,10 +209,6 @@ macro(onedpl_add_test test_source_file switch_off_checked_iterators)
if (_test_name IN_LIST coal_tests)
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=1" "${extra_test_label}")
onedpl_construct_exec(${test_source_file} ${_test_name}.coal ${switch_off_checked_iterators} "-D_ONEDPL_DETECT_SPIRV_COMPILATION=0" "${extra_test_label}")
elseif (_test_name IN_LIST workaround_for_igpu_64bit_reduction_tests)
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}")
string(REPLACE "\.pass" "_workaround_64bit_reduction\.pass" _test_name ${_test_name})
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "-D_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION=1" "${extra_test_label}")
elseif(_test_name STREQUAL "free_after_unload.pass")
onedpl_construct_exec(${test_source_file} ${_test_name} ${switch_off_checked_iterators} "" "${extra_test_label}")
onedpl_construct_exec(${test_source_file} ${_test_name}.after_pstl_offload ${switch_off_checked_iterators} "" "${extra_test_label}")
Expand Down
24 changes: 5 additions & 19 deletions test/parallel_api/numeric/numeric.ops/reduce_by_segment.pass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -13,14 +13,6 @@
//
//===----------------------------------------------------------------------===//

#if defined(ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
#undef ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
#endif

#if defined(_ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION)
# define ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION _ONEDPL_TEST_FORCE_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
#endif

#include "support/test_config.h"

#include "oneapi/dpl/execution"
Expand Down Expand Up @@ -306,18 +298,12 @@ void
run_test_on_device()
{
#if TEST_DPCPP_BACKEND_PRESENT
// Skip 64-byte types testing when the algorithm is broken and there is no the workaround
#if _PSTL_ICPX_TEST_RED_BY_SEG_BROKEN_64BIT_TYPES && !ONEDPL_WORKAROUND_FOR_IGPU_64BIT_REDUCTION
if constexpr (sizeof(ValueType) != 8)
#endif
if (TestUtils::has_type_support<ValueType>(TestUtils::get_test_queue().get_device()))
{
if (TestUtils::has_type_support<ValueType>(TestUtils::get_test_queue().get_device()))
{
// Run tests for USM shared memory
test4buffers<sycl::usm::alloc::shared, test_reduce_by_segment<ValueType, BinaryPredicate, BinaryOperation>>();
// Run tests for USM device memory
test4buffers<sycl::usm::alloc::device, test_reduce_by_segment<ValueType, BinaryPredicate, BinaryOperation>>();
}
// Run tests for USM shared memory
test4buffers<sycl::usm::alloc::shared, test_reduce_by_segment<ValueType, BinaryPredicate, BinaryOperation>>();
// Run tests for USM device memory
test4buffers<sycl::usm::alloc::device, test_reduce_by_segment<ValueType, BinaryPredicate, BinaryOperation>>();
}
#endif // TEST_DPCPP_BACKEND_PRESENT
}
Expand Down
Loading