Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host Implementation of Histogram APIs #1974

Merged
merged 66 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 55 commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
7bcd21e
histogram CPU initial implementation
danhoeflinger Nov 8, 2024
955f0e2
atomics histogram implementation
danhoeflinger Nov 8, 2024
ebcada6
clang format
danhoeflinger Nov 8, 2024
93dd493
comment about atomics
danhoeflinger Nov 8, 2024
4d95044
first draft of atomic increment (unchecked)
danhoeflinger Nov 12, 2024
a7fe8a1
atomics include and fix builtin
danhoeflinger Nov 12, 2024
6be0b7b
large case
danhoeflinger Nov 12, 2024
9e3f16c
fix threshold check
danhoeflinger Nov 15, 2024
0592c15
minor improvements
danhoeflinger Nov 19, 2024
6ac87a0
MSVC fixes
danhoeflinger Nov 20, 2024
af5defb
parallelize initialization of openMP temp histograms
danhoeflinger Nov 20, 2024
65c30af
removing unnecessary type casting
danhoeflinger Dec 13, 2024
eec36d5
improving accumulation of local histograms (simd)
danhoeflinger Dec 13, 2024
1faece5
Properly using IsVector
danhoeflinger Dec 14, 2024
a39accd
typo fix
danhoeflinger Dec 16, 2024
2cd184d
special handling thread zero to use global mem
danhoeflinger Dec 16, 2024
b3dd2a0
cleanup
danhoeflinger Dec 16, 2024
bff45d9
atomic version removal
danhoeflinger Dec 16, 2024
2c8cc04
Revert "cleanup"
danhoeflinger Dec 16, 2024
790121b
Revert "special handling thread zero to use global mem"
danhoeflinger Dec 16, 2024
0392f23
comments and cleanup
danhoeflinger Dec 16, 2024
4556399
handling coarser grained parallelism
danhoeflinger Dec 16, 2024
2fdea34
undo-ing broken thread restriction in openMP
danhoeflinger Dec 16, 2024
a0c016a
lift pattern up to algorithm_impl level
danhoeflinger Dec 18, 2024
e87ba02
cleanup unnecessary code
danhoeflinger Dec 18, 2024
9f74810
further cleanup / formatting
danhoeflinger Dec 18, 2024
e0ed14c
add grain size todo
danhoeflinger Dec 20, 2024
db2a474
more general thread local storage
danhoeflinger Dec 20, 2024
240447e
implement omp on demand tls
danhoeflinger Dec 30, 2024
741f8e3
formatting
danhoeflinger Dec 30, 2024
fd310b6
formatting
danhoeflinger Dec 30, 2024
6820340
comments and clarity
danhoeflinger Dec 30, 2024
4798d2c
bugfix for serial impl
danhoeflinger Dec 30, 2024
ce8b55f
removing debugging output
danhoeflinger Dec 30, 2024
5d1f6e8
formatting
danhoeflinger Dec 30, 2024
6630018
comment adjustment
danhoeflinger Dec 30, 2024
83a9f81
minor naming / formatting
danhoeflinger Dec 30, 2024
a3f1304
formatting
danhoeflinger Dec 30, 2024
14cca58
Address review feedback
danhoeflinger Jan 6, 2025
766f42c
formatting
danhoeflinger Jan 6, 2025
6425d37
address review feedback
danhoeflinger Jan 13, 2025
a590cac
address feedback
danhoeflinger Jan 13, 2025
dd35fc9
formatting
danhoeflinger Jan 13, 2025
455cf9c
Add comment about using `size()`
danhoeflinger Jan 13, 2025
8b094fe
fixing include errors
danhoeflinger Jan 13, 2025
7e1463e
formatting
danhoeflinger Jan 13, 2025
d9d4f08
adding const
danhoeflinger Jan 15, 2025
bebba5e
address feedback
danhoeflinger Jan 15, 2025
d494601
address feedback
danhoeflinger Jan 15, 2025
0a2847f
rename to __enumerable_thread_local_storage
danhoeflinger Jan 15, 2025
ee65630
address feedback
danhoeflinger Jan 21, 2025
f23132e
switching to == to be more clear of intention
danhoeflinger Jan 21, 2025
6606a80
restoring tbbmalloc as optional
danhoeflinger Jan 21, 2025
71ca3d8
simplifying construction of OMP
danhoeflinger Jan 21, 2025
c59a451
typo; missing underscores
danhoeflinger Jan 21, 2025
87e9780
Apply feedback
danhoeflinger Jan 22, 2025
d8f09aa
Remove large case as it is not needed for coverage
danhoeflinger Jan 22, 2025
2c114ee
moving histogram pattern to histogram_impl
danhoeflinger Jan 22, 2025
1b7f146
implement make function in backend
danhoeflinger Jan 22, 2025
a2fb53b
Adding comment for get_with_id
danhoeflinger Jan 22, 2025
397913f
clang format
danhoeflinger Jan 22, 2025
4db8104
rename make function
danhoeflinger Jan 22, 2025
66d15bc
fix includes
danhoeflinger Jan 22, 2025
1146bb6
rename variables away from global
danhoeflinger Jan 22, 2025
8b35ae6
clang format
danhoeflinger Jan 22, 2025
aa61385
simplify lambda using indices
danhoeflinger Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
set(_openmp_flag -fopenmp)
elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
set(_openmp_flag /Qopenmp)
set(_openmp_flag -Qopenmp)
endif()
if (_openmp_flag)
message(STATUS "Using ${_openmp_flag} for openMP")
Expand Down
89 changes: 89 additions & 0 deletions include/oneapi/dpl/pstl/algorithm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ namespace dpl
namespace __internal
{

// Factory helper: constructs the active parallel backend's
// __enumerable_thread_local_storage for per-thread values of _ValueType.
// The forwarded __args are stored and used as the constructor arguments for
// each thread's lazily-created local value (see the backend implementations).
template <typename _ValueType, typename... Args>
__par_backend::__enumerable_thread_local_storage<_ValueType, Args...>
__make_etls(Args&&... __args)
{
    return __par_backend::__enumerable_thread_local_storage<_ValueType, Args...>(std::forward<Args>(__args)...);
}

//------------------------------------------------------------------------
// any_of
//------------------------------------------------------------------------
Expand Down Expand Up @@ -4289,6 +4296,88 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera
return __res.base();
}

// Serial histogram kernel: bins each element of [__first, __last) into the
// caller-provided histogram via the bin-hash functor. Per the histogram
// specification, input values that do not map to a defined bin (signalled by a
// negative bin index from get_bin) are skipped silently.
template <class _ForwardIterator, class _IdxHashFunc, class _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
                  _RandomAccessIterator __histogram_first, _IsVector) noexcept
{
    while (__first != __last)
    {
        const std::int32_t __bin = __func.get_bin(*__first);
        // A negative index means the value falls outside every bin: skip it.
        if (__bin >= 0)
            ++__histogram_first[__bin];
        ++__first;
    }
}

// Histogram pattern for the serial and parallel-forward tags: zero the output
// range, then run the serial histogram brick over the whole input.
template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
    // This overload only serves the non-parallel tags; the parallel tag has a
    // dedicated overload below.
    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
    using _BinValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;
    // The output histogram must be zero-initialized before accumulation.
    __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
                   _BinValueT{0});
    __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

// Parallel histogram pattern: each thread accumulates into its own
// zero-initialized local histogram (enumerable thread-local storage), then the
// local histograms are reduced into the output histogram in a second parallel
// pass partitioned over bins.
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator2>
void
__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
                    _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func,
                    _RandomAccessIterator2 __histogram_first)
{
    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
    using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

    const _DiffType __n = __last - __first;
    if (__n == 0)
    {
        // An empty input still requires the output histogram to be zero-filled.
        __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first,
                       __histogram_first + __num_bins, _HistogramValueT{0});
    }
    else
    {
        // One zero-initialized local histogram per participating thread,
        // created lazily on first use by the backend's TLS abstraction.
        auto __tls =
            oneapi::dpl::__internal::__make_etls<std::vector<_HistogramValueT>>(__num_bins, _HistogramValueT{0});

        // Main histogram loop over the input.
        //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead
        __par_backend::__parallel_for(
            __backend_tag{}, __exec, __first, __last,
            [__func, &__tls](_RandomAccessIterator1 __chunk_begin, _RandomAccessIterator1 __chunk_end) {
                __internal::__brick_histogram(__chunk_begin, __chunk_end, __func,
                                              __tls.get_for_current_thread().begin(), _IsVector{});
            });

        // Reduce the per-thread local histograms into the output histogram,
        // parallelized across bins.
        const std::size_t __num_local_copies = __tls.size();
        __par_backend::__parallel_for(
            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
            [__num_local_copies, __histogram_first, &__tls](auto __out_begin, auto __out_end) {
                const _DiffType __bins_in_chunk = __out_end - __out_begin;
                const _DiffType __bin_offset = __out_begin - __histogram_first;
                // Seed the output range by assigning from the first local
                // histogram (avoids a separate zero-fill pass)...
                __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __bin_offset, __bins_in_chunk, __out_begin,
                                            oneapi::dpl::__internal::__pstl_assign(), _IsVector{});
                // ...then accumulate the remaining local histograms with +=.
                for (std::size_t __copy = 1; __copy < __num_local_copies; ++__copy)
                {
                    __internal::__brick_walk2_n(
                        __tls.get_with_id(__copy).begin() + __bin_offset, __bins_in_chunk, __out_begin,
                        [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
                }
            });
    }
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
Expand Down
18 changes: 1 addition & 17 deletions include/oneapi/dpl/pstl/histogram_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "iterator_impl.h"
#include "algorithm_impl.h"
akukanov marked this conversation as resolved.
Show resolved Hide resolved

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
Expand All @@ -29,23 +30,6 @@ namespace oneapi
namespace dpl
{

namespace __internal
{

template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
typename _RandomAccessIterator2>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
_Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);

static_assert(sizeof(_Size) == 0 /*false*/,
"Histogram API is currently unsupported for policies other than device execution policies");
}

} // namespace __internal

template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _ValueType,
typename _RandomAccessIterator2>
oneapi::dpl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _RandomAccessIterator2>
Expand Down
66 changes: 66 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>
#include <type_traits>
#include <omp.h>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
Expand Down Expand Up @@ -153,6 +155,70 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
__f(__first, __last);
}

template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
template <typename... _LocalArgs>
__enumerable_thread_local_storage(_LocalArgs&&... __args)
: __num_elements(0), __args(std::forward<_LocalArgs>(__args)...)
{
std::size_t __num_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
__thread_specific_storage.resize(__num_threads);
}

// Note: Size should not be used concurrently with parallel loops which may instantiate storage objects, as it may
// not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored.
// This is because the count is not atomic with the allocation and storage of the storage objects.
std::size_t
size() const
{
// only count storage which has been instantiated
return __num_elements.load();
}

_ValueType&
get_with_id(std::size_t __i)
{
assert(__i < size());

std::size_t __j = 0;

if (size() == __thread_specific_storage.size())
{
return *__thread_specific_storage[__i];
}

for (std::size_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
{
// Only include storage from threads which have instantiated a storage object
if (__thread_specific_storage[__j])
{
__count++;
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
}
}
// Need to back up one once we have found a valid storage object
return *__thread_specific_storage[__j - 1];
}

_ValueType&
get_for_current_thread()
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
{
std::size_t __i = omp_get_thread_num();
if (!__thread_specific_storage[__i])
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
{
// create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation
__thread_specific_storage[__i] =
std::apply([](_Args... __arg_pack) { return std::make_unique<_ValueType>(__arg_pack...); }, __args);
__num_elements.fetch_add(1);
}
return *__thread_specific_storage[__i];
}

std::vector<std::unique_ptr<_ValueType>> __thread_specific_storage;
std::atomic_size_t __num_elements;
std::tuple<_Args...> __args;
};

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
Expand Down
31 changes: 30 additions & 1 deletion include/oneapi/dpl/pstl/parallel_backend_serial.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <type_traits>

#include "parallel_backend_utils.h"

namespace oneapi
Expand All @@ -42,6 +42,35 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
{
}

// Enumerable thread-local storage for the serial backend: there is only ever
// one thread of execution, so thread-"local" storage degenerates to a single
// eagerly constructed value.
template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args) : __storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    // Exactly one storage object exists in the serial backend.
    std::size_t
    size() const
    {
        return 1;
    }

    // Every id refers to the one and only storage object.
    _ValueType&
    get_with_id(std::size_t /*__i*/)
    {
        return __storage;
    }

    _ValueType&
    get_for_current_thread()
    {
        return __storage;
    }

    _ValueType __storage;
};

template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
Expand Down
31 changes: 31 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
Expand Down Expand Up @@ -1306,6 +1307,36 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

// Enumerable thread-local storage for the TBB backend: a thin adaptor over
// tbb::enumerable_thread_specific. A thread's storage object is lazily
// constructed on first access from the constructor's argument pack.
template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __thread_specific_storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    // Number of thread-local storage objects instantiated so far.
    std::size_t
    size() const
    {
        return __thread_specific_storage.size();
    }

    // Returns the calling thread's storage object, creating it on first use.
    _ValueType&
    get_for_current_thread()
    {
        return __thread_specific_storage.local();
    }

    // Returns the __i-th instantiated storage object, in the iteration order
    // of the underlying container. NOTE(review): presumably only valid once no
    // new storage can be created concurrently (i.e. after the parallel phase)
    // -- confirm against callers.
    _ValueType&
    get_with_id(std::size_t __i)
    {
        return __thread_specific_storage.begin()[__i];
    }

    tbb::enumerable_thread_specific<_ValueType> __thread_specific_storage;
};

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi
Expand Down
Loading
Loading