diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index 93a5c8c2496..0570b8b4d85 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -319,7 +319,8 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         // 1. Local scan on each workgroup
         auto __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
-            auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
+            auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::write>(
+                __cgh, __dpl_sycl::__no_init{});
             __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
 #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
             __cgh.use_kernel_bundle(__kernel_1.get_kernel_bundle());
@@ -340,7 +341,7 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
             auto __iters_per_single_wg = oneapi::dpl::__internal::__dpl_ceiling_div(__n_groups, __wgroup_size);
             __submit_event = __exec.queue().submit([&](sycl::handler& __cgh) {
                 __cgh.depends_on(__submit_event);
-                auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
+                auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
                 __dpl_sycl::__local_accessor<_Type> __local_acc(__wgroup_size, __cgh);
 #if _ONEDPL_COMPILE_KERNEL && _ONEDPL_KERNEL_BUNDLE_PRESENT
                 __cgh.use_kernel_bundle(__kernel_2.get_kernel_bundle());
@@ -362,8 +363,9 @@ struct __parallel_scan_submitter<_CustomName, __internal::__optional_kernel_name
         auto __final_event = __exec.queue().submit([&](sycl::handler& __cgh) {
             __cgh.depends_on(__submit_event);
             oneapi::dpl::__ranges::__require_access(__cgh, __rng1, __rng2); //get an access to data under SYCL buffer
-            auto __temp_acc = __result_and_scratch.__get_scratch_acc(__cgh);
-            auto __res_acc = __result_and_scratch.__get_result_acc(__cgh);
+            auto __temp_acc = __result_and_scratch.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
+            auto __res_acc = __result_and_scratch.template __get_result_acc<sycl::access_mode::write>(
+                __cgh, __dpl_sycl::__no_init{});
             __cgh.parallel_for<_PropagateScanName...>(sycl::range<1>(__n_groups * __size_per_wg), [=](auto __item) {
                 auto __temp_ptr = __result_and_scratch_storage_t::__get_usm_or_buffer_accessor_ptr(__temp_acc);
                 auto __res_ptr =
@@ -579,7 +581,8 @@ struct __parallel_copy_if_static_single_group_submitter<_Size, _ElemsPerItem, _W
             // predicate on each element of the input range. The second half stores the index of the output
             // range to copy elements of the input range.
             auto __lacc = __dpl_sycl::__local_accessor<_ValueType>(sycl::range<1>{__elems_per_wg * 2}, __hdl);
-            auto __res_acc = __result.__get_result_acc(__hdl);
+            auto __res_acc =
+                __result.template __get_result_acc<sycl::access_mode::write>(__hdl, __dpl_sycl::__no_init{});
 
             __hdl.parallel_for<_ScanKernelName...>(
                 sycl::nd_range<1>(_WGSize, _WGSize), [=](sycl::nd_item<1> __self_item) {
@@ -1466,7 +1469,8 @@ __parallel_find_or_impl_one_wg(oneapi::dpl::__internal::__device_backend_tag, _E
     // main parallel_for
     auto __event = __exec.queue().submit([&](sycl::handler& __cgh) {
         oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
-        auto __result_acc = __result_storage.__get_result_acc(__cgh);
+        auto __result_acc =
+            __result_storage.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
 
         __cgh.parallel_for(
             sycl::nd_range(sycl::range(__wgroup_size), sycl::range(__wgroup_size)),
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
index ca776e94dce..edad63d2a79 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce.h
@@ -141,7 +141,8 @@ struct __parallel_transform_reduce_small_submitter<_Tp, _Commutative, _VecSize,
         sycl::event __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
-            auto __res_acc = __scratch_container.__get_result_acc(__cgh);
+            auto __res_acc =
+                __scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
             std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
             __dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
             __cgh.parallel_for<_Name...>(
@@ -208,7 +209,8 @@ struct __parallel_transform_reduce_device_kernel_submitter<_Tp, _Commutative, _V
             oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); // get an access to data under SYCL buffer
             std::size_t __local_mem_size = __reduce_pattern.local_mem_req(__work_group_size);
             __dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__local_mem_size), __cgh);
-            auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
+            auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::write>(
+                __cgh, __dpl_sycl::__no_init{});
             __cgh.parallel_for<_KernelName...>(
                 sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
                 [=](sycl::nd_item<1> __item_id) {
@@ -253,8 +255,9 @@ struct __parallel_transform_reduce_work_group_kernel_submitter<_Tp, _Commutative
 
         __reduce_event = __exec.queue().submit([&, __n](sycl::handler& __cgh) {
             __cgh.depends_on(__reduce_event);
-            auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
-            auto __res_acc = __scratch_container.__get_result_acc(__cgh);
+            auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read>(__cgh);
+            auto __res_acc =
+                __scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
             __dpl_sycl::__local_accessor<_Tp> __temp_local(sycl::range<1>(__work_group_size), __cgh);
 
             __cgh.parallel_for<_KernelName...>(
@@ -358,8 +361,10 @@ struct __parallel_transform_reduce_impl
             __reduce_event = __exec.queue().submit([&, __is_first, __offset_1, __offset_2, __n,
                                                     __n_groups](sycl::handler& __cgh) {
                 __cgh.depends_on(__reduce_event);
-                auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
-                auto __res_acc = __scratch_container.__get_result_acc(__cgh);
+                auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read_write>(
+                    __cgh, __is_first ? sycl::property_list{__dpl_sycl::__no_init{}} : sycl::property_list{});
+                auto __res_acc = __scratch_container.template __get_result_acc<sycl::access_mode::write>(
+                    __cgh, __dpl_sycl::__no_init{});
 
                 // get an access to data under SYCL buffer
                 oneapi::dpl::__ranges::__require_access(__cgh, __rngs...);
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
index 0856234985f..8c0762f2a38 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
@@ -298,7 +298,8 @@ struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inpu
             __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local, __cgh);
             __cgh.depends_on(__prior_event);
             oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
-            auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
+            auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::write>(
+                __cgh, __dpl_sycl::__no_init{});
             __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
                     _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
@@ -453,8 +454,9 @@ struct __parallel_reduce_then_scan_scan_submitter<
             __dpl_sycl::__local_accessor<_InitValueType> __sub_group_partials(__num_sub_groups_local + 1, __cgh);
             __cgh.depends_on(__prior_event);
             oneapi::dpl::__ranges::__require_access(__cgh, __in_rng, __out_rng);
-            auto __temp_acc = __scratch_container.__get_scratch_acc(__cgh);
-            auto __res_acc = __scratch_container.__get_result_acc(__cgh);
+            auto __temp_acc = __scratch_container.template __get_scratch_acc<sycl::access_mode::read_write>(__cgh);
+            auto __res_acc =
+                __scratch_container.template __get_result_acc<sycl::access_mode::write>(__cgh, __dpl_sycl::__no_init{});
             __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 9bd195a80a9..e7fbfb7ae7c 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -478,31 +478,35 @@ using __repacked_tuple_t = typename __repacked_tuple<_Tuple>::type;
 template <typename _ContainerOrIterable>
 using __value_t = typename __internal::__memobj_traits<_ContainerOrIterable>::value_type;
 
-template <typename _T>
+template <typename _Accessor>
 struct __usm_or_buffer_accessor
 {
   private:
-    using __accessor_t = sycl::accessor<_T, 1, sycl::access::mode::read_write, __dpl_sycl::__target_device,
-                                        sycl::access::placeholder::false_t>;
-    __accessor_t __acc;
+    using _T = std::decay_t<typename _Accessor::value_type>;
+    _Accessor __acc;
     _T* __ptr = nullptr;
     bool __usm = false;
     size_t __offset = 0;
 
   public:
     // Buffer accessor
-    __usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf)
-        : __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{}))
+    __usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf,
+                             const sycl::property_list& __prop_list)
+        : __acc(*__sycl_buf, __cgh, __prop_list)
     {
     }
-    __usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf, size_t __acc_offset)
-        : __acc(sycl::accessor(*__sycl_buf, __cgh, sycl::read_write, __dpl_sycl::__no_init{})), __offset(__acc_offset)
+    __usm_or_buffer_accessor(sycl::handler& __cgh, sycl::buffer<_T, 1>* __sycl_buf, size_t __acc_offset,
+                             const sycl::property_list& __prop_list)
+        : __acc(*__sycl_buf, __cgh, __prop_list), __offset(__acc_offset)
     {
     }
 
     // USM pointer
-    __usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf) : __ptr(__usm_buf), __usm(true) {}
-    __usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, size_t __ptr_offset)
+    __usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, const sycl::property_list&)
+        : __ptr(__usm_buf), __usm(true)
+    {
+    }
+    __usm_or_buffer_accessor(sycl::handler& __cgh, _T* __usm_buf, size_t __ptr_offset, const sycl::property_list&)
        : __ptr(__usm_buf), __usm(true), __offset(__ptr_offset)
     {
     }
@@ -520,6 +524,10 @@ struct __result_and_scratch_storage
   private:
     using __sycl_buffer_t = sycl::buffer<_T, 1>;
 
+    template <sycl::access_mode _AccessMode>
+    using __accessor_t =
+        sycl::accessor<_T, 1, _AccessMode, __dpl_sycl::__target_device, sycl::access::placeholder::false_t>;
+
     _ExecutionPolicy __exec;
     std::shared_ptr<_T> __scratch_buf;
     std::shared_ptr<_T> __result_buf;
@@ -611,29 +619,32 @@ struct __result_and_scratch_storage
 #endif
     }
 
+    template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
     auto
-    __get_result_acc(sycl::handler& __cgh) const
+    __get_result_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
     {
 #if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
         if (__use_USM_host && __supports_USM_device)
-            return __usm_or_buffer_accessor<_T>(__cgh, __result_buf.get());
+            return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __result_buf.get(), __prop_list);
         else if (__supports_USM_device)
-            return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get(), __scratch_n);
-        return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get(), __scratch_n);
+            return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __scratch_n,
+                                                                       __prop_list);
+        return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __scratch_n, __prop_list);
 #else
-        return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
+        return __accessor_t<_AccessMode>(*__sycl_buf.get(), __cgh, __prop_list);
 #endif
     }
 
+    template <sycl::access_mode _AccessMode = sycl::access_mode::read_write>
     auto
-    __get_scratch_acc(sycl::handler& __cgh) const
+    __get_scratch_acc(sycl::handler& __cgh, const sycl::property_list& __prop_list = {}) const
     {
 #if _ONEDPL_SYCL_UNIFIED_USM_BUFFER_PRESENT
         if (__use_USM_host || __supports_USM_device)
-            return __usm_or_buffer_accessor<_T>(__cgh, __scratch_buf.get());
-        return __usm_or_buffer_accessor<_T>(__cgh, __sycl_buf.get());
+            return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __scratch_buf.get(), __prop_list);
+        return __usm_or_buffer_accessor<__accessor_t<_AccessMode>>(__cgh, __sycl_buf.get(), __prop_list);
 #else
-        return sycl::accessor(*__sycl_buf.get(), __cgh, sycl::read_write, __dpl_sycl::__no_init{});
+        return __accessor_t<_AccessMode>(*__sycl_buf.get(), __cgh, __prop_list);
 #endif
     }
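
Note for reviewers: the property being threaded through all these call sites is SYCL's `no_init`, which tells the runtime that a kernel will fully overwrite the accessed data, so the buffer's prior contents never need to be copied to the device. Below is a minimal standalone sketch of that behavior in plain SYCL 2020, independent of the oneDPL internals in this patch (`__dpl_sycl::__no_init` is oneDPL's portability alias for `sycl::property::no_init`):

```cpp
// Minimal sketch: a write-only accessor with sycl::no_init skips the
// host-to-device copy of the buffer's prior contents.
#include <sycl/sycl.hpp>
#include <iostream>
#include <vector>

int main()
{
    sycl::queue q;
    std::vector<int> data(8, -1);
    {
        sycl::buffer<int, 1> buf(data.data(), sycl::range<1>(data.size()));
        q.submit([&](sycl::handler& cgh) {
            // Every element is overwritten below, so the old contents never
            // need to reach the device; no_init makes that explicit.
            sycl::accessor acc(buf, cgh, sycl::write_only, sycl::no_init);
            cgh.parallel_for(sycl::range<1>(data.size()),
                             [=](sycl::id<1> i) { acc[i] = static_cast<int>(i[0]); });
        });
    } // buffer destruction copies the results back into data
    for (int v : data)
        std::cout << v << ' ';
    std::cout << '\n';
}
```

The runtime-selected `sycl::property_list` in the `__parallel_transform_reduce_impl` hunk uses the same mechanism: `no_init` is passed only on the first ping-pong iteration, when the scratch contents are not yet meaningful.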
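
The other change repeated at every call site is syntactic: since `__get_result_acc`/`__get_scratch_acc` now take the access mode as a template parameter and the storage object has a dependent type, callers must use the `.template` disambiguator. A minimal model of the pattern, with hypothetical stand-in names (`Storage`, `get_acc`, `fill`), not the oneDPL types themselves:

```cpp
// Sketch of a templated accessor getter on a dependent type, mirroring the
// __result_and_scratch_storage changes in this patch (names are hypothetical).
#include <sycl/sycl.hpp>

template <typename T>
struct Storage // stand-in for oneDPL's __result_and_scratch_storage
{
    sycl::buffer<T, 1> buf{sycl::range<1>(64)};

    template <sycl::access_mode Mode = sycl::access_mode::read_write>
    auto
    get_acc(sycl::handler& cgh, const sycl::property_list& props = {})
    {
        return sycl::accessor<T, 1, Mode, sycl::target::device>(buf, cgh, props);
    }
};

template <typename T>
void
fill(sycl::queue& q, Storage<T>& st)
{
    q.submit([&](sycl::handler& cgh) {
        // st has a dependent type here, so `.template` is required before
        // the explicit template argument list -- exactly why every caller
        // in this diff becomes `.template __get_..._acc<...>(...)`.
        auto acc = st.template get_acc<sycl::access_mode::write>(cgh, sycl::property_list{sycl::no_init});
        cgh.parallel_for(sycl::range<1>(64), [=](sycl::id<1> i) { acc[i] = T(i[0]); });
    });
}

int main()
{
    sycl::queue q;
    Storage<int> st;
    fill(q, st);
    q.wait();
}
```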