Skip to content

Commit

Permalink
[oneDPL] Specify access mode in __result_and_scratch_storage methods
Browse files Browse the repository at this point in the history
  • Loading branch information
SergeyKopienko authored Oct 22, 2024
1 parent 9cc09d8 commit beb9167
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 34 deletions.
16 changes: 10 additions & 6 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
Original file line number Diff line number Diff line change
Expand Up @@ -319,7 +319,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
// 1. Local scan on each workgroup
auto __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
Expand All @@ -340,7 +341,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __iters_per_single_wg = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __wgroup_size);
__submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
__cgh.depends_on(__submit_event);
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
__dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
#if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
__cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
Expand All @@ -362,8 +363,9 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
auto __final_event = __exec.queue().submit([&](sycl::handler& __cgh) {
__cgh.depends_on(__submit_event);
oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
auto __res_acc = __result_and_scratch.__get_result_acc(__cgh);
auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
auto __res_acc = __result_and_scratch.template __get_result_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__cgh.parallel_for<_PropagateScanName...>(sycl::range<1>(__n_groups * __size_per_wg), [=](auto __item) {
auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
auto __res_ptr =
Expand Down Expand Up @@ -579,7 +581,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
// predicate on each element of the input range. The second half stores the index of the output
// range to copy elements of the input range.
auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
auto __res_acc = __result.__get_result_acc(__hdl);
auto __res_acc =
__result.template __get_result_acc<sycl::access_mode::write>(__hdl, __dpl_sycl::__no_init{});

__hdl.parallel_for<_ScanKernelName...>(
sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
Expand Down Expand Up @@ -1466,7 +1469,8 @@ __parallel_find_or_impl_one_wg(oneapi::dpl::__internal::__device_backend_tag, _E
// main parallel_for
auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
auto __result_acc = __result_storage.__get_result_acc(__cgh);
auto __result_acc =
__result_storage.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});

__cgh.parallel_for<KernelName>(
sycl::nd_range</*dim=*/1>(sycl::range</*dim=*/1>(__wgroup_size), sycl::range</*dim=*/1>(__wgroup_size)),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -141,7 +141,8 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,

sycl::event __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __res_acc =
__scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
__cgh.parallel_for<_Name...>(
Expand Down Expand Up @@ -208,7 +209,8 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__cgh.parallel_for<_KernelName...>(
sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
[=](sycl::nd_item<1> __item_id) {
Expand Down Expand Up @@ -253,8 +255,9 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
__reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);

auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
auto __res_acc =
__scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
__dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__work_group_size), __cgh);

__cgh.parallel_for<_KernelName...>(
Expand Down Expand Up @@ -358,8 +361,10 @@ struct __parallel_transform_reduce_impl
__reduce_event = __exec.queue().submit([&, __is_first, __offset_1, __offset_2, __n,
__n_groups](sycl::handler& __cgh) {
__cgh.depends_on(__reduce_event);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read_write>(
__cgh, __is_first ? sycl::property_list{__dpl_sycl::__no_init{}} : sycl::property_list{});
auto __res_acc = __scratch_container.template __get_result_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});

// get an access to data under SYCL buffer
oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -298,7 +298,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh);
__cgh.depends_on(__prior_event);
oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::write>(
__cgh, __dpl_sycl::__no_init{});
__cgh.parallel_for<_KernelName...>(
__nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
_InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
Expand Down Expand Up @@ -453,8 +454,9 @@ struct __parallel_reduce_then_scan_scan_submitter<
__dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh);
__cgh.depends_on(__prior_event);
oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng);
auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
auto __res_acc = __scratch_container.__get_result_acc(__cgh);
auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
auto __res_acc =
__scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});

__cgh.parallel_for<_KernelName...>(
__nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
Expand Down
49 changes: 30 additions & 19 deletions include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
Original file line number Diff line number Diff line change
Expand Up @@ -478,31 +478,35 @@ using __repacked_tuple_t = typename __repacked_tuple<T>::type;
template <typename _ContainerOrIterable>
using __value_t = typename __internal::__memobj_traits<_ContainerOrIterable>::value_type;

template <typename _T>
template <typename _Accessor>
struct __usm_or_buffer_accessor
{
private:
using __accessor_t = sycl::accessor<_T, 1, sycl::access::mode::read_write, __dpl_sycl::__target_device,
sycl::access::placeholder::false_t>;
__accessor_t __acc;
using _T = std::decay_t<typename _Accessor::value_type>;
_Accessor __acc;
_T* __ptr = nullptr;
bool __usm = false;
size_t __offset = 0;

public:
// Buffer accessor
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf)
: __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{}))
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf,
const sycl::property_list& __prop_list)
: __acc(*__sycl_buf, __cgh, __prop_list)
{
}
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf, size_t __acc_offset)
: __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{})), __offset(__acc_offset)
__usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf, size_t __acc_offset,
const sycl::property_list& __prop_list)
: __acc(*__sycl_buf, __cgh, __prop_list), __offset(__acc_offset)
{
}

// USM pointer
__usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf) : __ptr(__usm_buf), __usm(true) {}
__usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, size_t __ptr_offset)
__usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, const sycl::property_list&)
: __ptr(__usm_buf), __usm(true)
{
}
__usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, size_t __ptr_offset, const sycl::property_list&)
: __ptr(__usm_buf), __usm(true), __offset(__ptr_offset)
{
}
Expand All @@ -520,6 +524,10 @@ struct __result_and_scratch_storage
private:
using __sycl_buffer_t = sycl::buffer<_T, 1>;

template <sycl::access_mode _AccessMode>
using __accessor_t =
sycl::accessor<_T, 1, _AccessMode, __dpl_sycl::__target_device, sycl::access::placeholder::false_t>;

_ExecutionPolicy __exec;
std::shared_ptr<_T> __scratch_buf;
std::shared_ptr<_T> __result_buf;
Expand Down Expand Up @@ -611,29 +619,32 @@ struct __result_and_scratch_storage
#endif
}

template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
auto
__get_result_acc(sycl::handler& __cgh) const
__get_result_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
if (__use_USM_host && __supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __result_buf.get());
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __result_buf.get(), __prop_list);
else if (__supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get(), __scratch_n);
return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get(), __scratch_n);
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __scratch_n,
__prop_list);
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __scratch_n, __prop_list);
#else
return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
return __accessor_t<_AccessMode>(*__sycl_buf.get(), __cgh, __prop_list);
#endif
}

template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
auto
__get_scratch_acc(sycl::handler& __cgh) const
__get_scratch_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
{
#if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
if (__use_USM_host || __supports_USM_device)
return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get());
return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get());
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __prop_list);
return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __prop_list);
#else
return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
return __accessor_t<_AccessMode>(*__sycl_buf.get(), __cgh, __prop_list);
#endif
}

Expand Down

0 comments on commit beb9167

Please sign in to comment.