Host Implementation of Histogram APIs (#1974)
Signed-off-by: Dan Hoeflinger <[email protected]>
danhoeflinger authored Jan 23, 2025
1 parent b619105 commit 780705c
Showing 6 changed files with 275 additions and 21 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
  if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
    set(_openmp_flag -fopenmp)
  elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
-    set(_openmp_flag /Qopenmp)
+    set(_openmp_flag -Qopenmp)
  endif()
  if (_openmp_flag)
    message(STATUS "Using ${_openmp_flag} for openMP")
86 changes: 79 additions & 7 deletions include/oneapi/dpl/pstl/histogram_impl.h
@@ -18,7 +18,9 @@

#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "execution_impl.h"
#include "iterator_impl.h"
#include "algorithm_fwd.h"

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
@@ -32,16 +34,86 @@ namespace dpl
namespace __internal
{

-template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
-          typename _RandomAccessIterator2>
template <class _ForwardIterator, class _IdxHashFunc, class _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
                  _RandomAccessIterator __histogram_first, _IsVector) noexcept
{
    for (; __first != __last; ++__first)
    {
        std::int32_t __bin = __func.get_bin(*__first);
        if (__bin >= 0)
        {
            ++__histogram_first[__bin];
        }
    }
}
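The brick only requires that __func expose a get_bin member returning a signed bin index, where a negative value means the sample falls outside every bin (the loop above skips those). As a purely hypothetical illustration — the real bin-hash helpers live in histogram_binhash_utils.h and may differ in names and details — an even-width binner could look like:

// Hypothetical even-width bin mapper; illustration only, not part of this commit.
// Assumes a floating-point _T; needs <cstdint> for std::int32_t.
template <typename _T>
struct __example_even_binhash
{
    _T __min, __max;
    std::int32_t __num_bins;

    std::int32_t
    get_bin(_T __value) const
    {
        if (__value < __min || __value >= __max)
            return -1; // out of range: the histogram brick skips negative bins
        return static_cast<std::int32_t>((__value - __min) * __num_bins / (__max - __min));
    }
};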

template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator>
void
-__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
-                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
+__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
+                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;
    static_assert(oneapi::dpl::__internal::__is_serial_tag_v<_Tag> ||
                  oneapi::dpl::__internal::__is_parallel_forward_tag_v<_Tag>);
    __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
                   _HistogramValueT{0});
    __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator2>
void
__pattern_histogram(oneapi::dpl::__internal::__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec,
                    _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins,
                    _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
    using __backend_tag = typename oneapi::dpl::__internal::__parallel_tag<_IsVector>::__backend_tag;
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
    using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

    _DiffType __n = __last - __first;
    if (__n == 0)
    {
        // when n == 0, we must fill the output histogram with zeros
        __pattern_fill(oneapi::dpl::__internal::__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),
                       __histogram_first, __histogram_first + __num_bins, _HistogramValueT{0});
    }
    else
    {
        auto __tls =
            __par_backend::__make_enumerable_tls<std::vector<_HistogramValueT>>(__num_bins, _HistogramValueT{0});

-    static_assert(sizeof(_Size) == 0 /*false*/,
-                  "Histogram API is currently unsupported for policies other than device execution policies");
        // main histogram loop
        // TODO: add defaulted grain-size option for __parallel_for and use a larger one here to account for overhead
        __par_backend::__parallel_for(
            __backend_tag{}, __exec, __first, __last,
            [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) {
                __internal::__brick_histogram(__first_local, __last_local, __func,
                                              __tls.get_for_current_thread().begin(), _IsVector{});
            });
        // now accumulate the temporary storage into the output histogram
        const std::size_t __num_temporary_copies = __tls.size();
        __par_backend::__parallel_for(
            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _Size{0}, __num_bins,
            [__num_temporary_copies, __histogram_first, &__tls](auto __hist_start_id, auto __hist_end_id) {
                const _DiffType __local_n = __hist_end_id - __hist_start_id;
                // initialize the output histogram with the first local histogram via assignment
                __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __hist_start_id, __local_n,
                                            __histogram_first + __hist_start_id,
                                            oneapi::dpl::__internal::__pstl_assign(), _IsVector{});
                for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i)
                {
                    // accumulate each remaining local histogram into the output histogram via the += operator
                    __internal::__brick_walk2_n(
                        __tls.get_with_id(__i).begin() + __hist_start_id, __local_n,
                        __histogram_first + __hist_start_id,
                        [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
                }
            });
    }
}

} // namespace __internal
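Taken together, these patterns give the histogram extension a host path: serial and parallel-forward tags zero the output and run a single brick pass, while the parallel tag reduces per-thread partial histograms. A minimal usage sketch, assuming the even-bin oneapi::dpl::histogram overload (exec, first, last, num_bins, range_min, range_max, histogram_first) described in the oneDPL extension documentation:

// Sketch under the stated assumptions; not code from this commit.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<float> data{0.1f, 0.7f, 1.5f, 2.2f, 2.9f, 3.8f};
    std::vector<std::uint32_t> hist(4, 0);
    // four even-width bins over [0, 4); host execution policies are what this commit enables
    oneapi::dpl::histogram(oneapi::dpl::execution::par_unseq, data.begin(), data.end(),
                           std::size_t{4}, 0.0f, 4.0f, hist.begin());
    // expected contents of hist: {2, 1, 2, 1}
    return 0;
}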
83 changes: 83 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
@@ -20,11 +20,13 @@
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>
#include <type_traits>
#include <omp.h>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
@@ -153,6 +155,87 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
    __f(__first, __last);
}

namespace __detail
{

template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __num_elements(0), __args(std::forward<_LocalArgs>(__args)...)
    {
        std::size_t __num_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
        __thread_specific_storage.resize(__num_threads);
    }

    // Note: size() should not be used concurrently with parallel loops which may instantiate storage objects,
    // because the count is not updated atomically with the allocation and storage of a new object; it may
    // therefore lag behind the number of storage objects actually allocated and stored.
    std::size_t
    size() const
    {
        // only count storage which has been instantiated
        return __num_elements.load();
    }

    // Note: get_with_id() should not be used concurrently with parallel loops which may instantiate storage
    // objects, as it may provide an out-of-date view of the stored objects depending on the timing of new object
    // creation and the size increment.
    // TODO: Consider replacing this access with a visitor pattern.
    _ValueType&
    get_with_id(std::size_t __i)
    {
        assert(__i < size());

        std::size_t __j = 0;

        if (size() == __thread_specific_storage.size())
        {
            return *__thread_specific_storage[__i];
        }

        for (std::size_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
        {
            // Only include storage from threads which have instantiated a storage object
            if (__thread_specific_storage[__j])
            {
                ++__count;
            }
        }
        // Back up by one: the loop advances __j one past the (__i + 1)-th instantiated storage object
        return *__thread_specific_storage[__j - 1];
    }

    _ValueType&
    get_for_current_thread()
    {
        std::size_t __i = omp_get_thread_num();
        if (!__thread_specific_storage[__i])
        {
            // create storage on first use to avoid an extra parallel region and unnecessary instantiation
            __thread_specific_storage[__i] =
                std::apply([](_Args... __arg_pack) { return std::make_unique<_ValueType>(__arg_pack...); }, __args);
            __num_elements.fetch_add(1);
        }
        return *__thread_specific_storage[__i];
    }

    std::vector<std::unique_ptr<_ValueType>> __thread_specific_storage;
    std::atomic_size_t __num_elements;
    std::tuple<_Args...> __args;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType, Args...>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType, Args...>(std::forward<Args>(__args)...);
}
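A minimal sketch of the intended usage (illustrative, not code from this commit): each OpenMP thread lazily materializes its local object on its first call to get_for_current_thread(), and a single thread afterwards combines the instantiated copies by id, which is safe once no parallel loop is running:

// Illustration: per-thread partial counts combined sequentially afterwards.
auto __tls = oneapi::dpl::__omp_backend::__make_enumerable_tls<std::vector<int>>(16, 0);

#pragma omp parallel for
for (int __i = 0; __i < 1000; ++__i)
{
    std::vector<int>& __local = __tls.get_for_current_thread(); // created lazily per thread
    ++__local[__i % 16];
}

std::vector<int> __total(16, 0);
for (std::size_t __id = 0; __id < __tls.size(); ++__id) // safe: no parallel loop is active
    for (std::size_t __b = 0; __b < 16; ++__b)
        __total[__b] += __tls.get_with_id(__id)[__b];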

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
44 changes: 43 additions & 1 deletion include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -20,11 +20,11 @@

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <type_traits>

#include "parallel_backend_utils.h"

namespace oneapi
@@ -34,6 +34,48 @@ namespace dpl
namespace __serial_backend
{

namespace __detail
{

template <typename _ValueType>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args) : __storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    std::size_t
    size() const
    {
        return std::size_t{1};
    }

    _ValueType&
    get_for_current_thread()
    {
        return __storage;
    }

    _ValueType&
    get_with_id(std::size_t /*__i*/)
    {
        return get_for_current_thread();
    }

    _ValueType __storage;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType>(std::forward<Args>(__args)...);
}

template <typename _ExecutionPolicy, typename _Tp>
using __buffer = oneapi::dpl::__utils::__buffer_impl<std::decay_t<_ExecutionPolicy>, _Tp, std::allocator>;

44 changes: 44 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
@@ -1306,6 +1307,49 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
    tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

namespace __detail
{

template <typename _ValueType>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __thread_specific_storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    std::size_t
    size() const
    {
        return __thread_specific_storage.size();
    }

    _ValueType&
    get_for_current_thread()
    {
        return __thread_specific_storage.local();
    }

    _ValueType&
    get_with_id(std::size_t __i)
    {
        return __thread_specific_storage.begin()[__i];
    }

    tbb::enumerable_thread_specific<_ValueType> __thread_specific_storage;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType>(std::forward<Args>(__args)...);
}
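Here the wrapper is a thin facade over tbb::enumerable_thread_specific, whose local() lazily constructs the per-thread value on first access and forwards the stored constructor arguments to each per-thread element. For comparison, the combine step written directly against TBB (illustrative, not code from this commit) could use combine_each:

// Illustration: combining per-thread partial histograms with TBB directly.
#include <tbb/enumerable_thread_specific.h>
#include <vector>

std::vector<int>
__combine_partials(tbb::enumerable_thread_specific<std::vector<int>>& __tls)
{
    std::vector<int> __total(16, 0);
    __tls.combine_each([&__total](const std::vector<int>& __local) {
        for (std::size_t __b = 0; __b < __local.size(); ++__b)
            __total[__b] += __local[__b];
    });
    return __total;
}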

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi
