Host Implementation of Histogram APIs (#1974)
Signed-off-by: Dan Hoeflinger <[email protected]>
danhoeflinger authored Jan 23, 2025
1 parent b619105 commit 780705c
Showing 6 changed files with 275 additions and 21 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
@@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
  if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
    set(_openmp_flag -fopenmp)
  elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
-    set(_openmp_flag /Qopenmp)
+    set(_openmp_flag -Qopenmp)
  endif()
  if (_openmp_flag)
    message(STATUS "Using ${_openmp_flag} for openMP")
86 changes: 79 additions & 7 deletions include/oneapi/dpl/pstl/histogram_impl.h
@@ -18,7 +18,9 @@

#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "execution_impl.h"
#include "iterator_impl.h"
#include "algorithm_fwd.h"

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
@@ -32,16 +34,86 @@ namespace dpl
namespace __internal
{

-template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
-          typename _RandomAccessIterator2>
template <class _ForwardIterator, class _IdxHashFunc, class _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
                  _RandomAccessIterator __histogram_first, _IsVector) noexcept
{
    for (; __first != __last; ++__first)
    {
        std::int32_t __bin = __func.get_bin(*__first);
        if (__bin >= 0)
        {
            ++__histogram_first[__bin];
        }
    }
}
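The brick only requires that __func expose a get_bin member returning a signed bin index, where a negative value means the sample falls outside every bin (the loop above skips those). As a purely hypothetical illustration — the real bin-hash helpers live in histogram_binhash_utils.h and may differ in names and details — an even-width binner could look like:

// Hypothetical even-width bin mapper; illustration only, not part of this commit.
// Assumes a floating-point _T; needs <cstdint> for std::int32_t.
template <typename _T>
struct __example_even_binhash
{
    _T __min, __max;
    std::int32_t __num_bins;

    std::int32_t
    get_bin(_T __value) const
    {
        if (__value < __min || __value >= __max)
            return -1; // out of range: the histogram brick skips negative bins
        return static_cast<std::int32_t>((__value - __min) * __num_bins / (__max - __min));
    }
};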

template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator>
void
-__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
-                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
+__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
+                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;
    static_assert(oneapi::dpl::__internal::__is_serial_tag_v<_Tag> ||
                  oneapi::dpl::__internal::__is_parallel_forward_tag_v<_Tag>);
    __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
                   _HistogramValueT{0});
    __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator2>
void
__pattern_histogram(oneapi::dpl::__internal::__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec,
                    _RandomAccessIterator1 __first, _RandomAccessIterator1 __last, _Size __num_bins,
                    _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
    using __backend_tag = typename oneapi::dpl::__internal::__parallel_tag<_IsVector>::__backend_tag;
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
    using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

    _DiffType __n = __last - __first;
    if (__n == 0)
    {
        // when n == 0, we must fill the output histogram with zeros
        __pattern_fill(oneapi::dpl::__internal::__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec),
                       __histogram_first, __histogram_first + __num_bins, _HistogramValueT{0});
    }
    else
    {
        auto __tls =
            __par_backend::__make_enumerable_tls<std::vector<_HistogramValueT>>(__num_bins, _HistogramValueT{0});

-    static_assert(sizeof(_Size) == 0 /*false*/,
-                  "Histogram API is currently unsupported for policies other than device execution policies");
        // main histogram loop
        // TODO: add defaulted grain-size option for __parallel_for and use a larger one here to account for overhead
        __par_backend::__parallel_for(
            __backend_tag{}, __exec, __first, __last,
            [__func, &__tls](_RandomAccessIterator1 __first_local, _RandomAccessIterator1 __last_local) {
                __internal::__brick_histogram(__first_local, __last_local, __func,
                                              __tls.get_for_current_thread().begin(), _IsVector{});
            });
        // now accumulate the temporary storage into the output histogram
        const std::size_t __num_temporary_copies = __tls.size();
        __par_backend::__parallel_for(
            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), _Size{0}, __num_bins,
            [__num_temporary_copies, __histogram_first, &__tls](auto __hist_start_id, auto __hist_end_id) {
                const _DiffType __local_n = __hist_end_id - __hist_start_id;
                // initialize the output histogram with the first local histogram via assignment
                __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __hist_start_id, __local_n,
                                            __histogram_first + __hist_start_id,
                                            oneapi::dpl::__internal::__pstl_assign(), _IsVector{});
                for (std::size_t __i = 1; __i < __num_temporary_copies; ++__i)
                {
                    // accumulate each remaining local histogram into the output histogram via the += operator
                    __internal::__brick_walk2_n(
                        __tls.get_with_id(__i).begin() + __hist_start_id, __local_n,
                        __histogram_first + __hist_start_id,
                        [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
                }
            });
    }
}

} // namespace __internal
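Taken together, these patterns give the histogram extension a host path: serial and parallel-forward tags zero the output and run a single brick pass, while the parallel tag reduces per-thread partial histograms. A minimal usage sketch, assuming the even-bin oneapi::dpl::histogram overload (exec, first, last, num_bins, range_min, range_max, histogram_first) described in the oneDPL extension documentation:

// Sketch under the stated assumptions; not code from this commit.
#include <oneapi/dpl/execution>
#include <oneapi/dpl/algorithm>
#include <cstdint>
#include <vector>

int main()
{
    std::vector<float> data{0.1f, 0.7f, 1.5f, 2.2f, 2.9f, 3.8f};
    std::vector<std::uint32_t> hist(4, 0);
    // four even-width bins over [0, 4); host execution policies are what this commit enables
    oneapi::dpl::histogram(oneapi::dpl::execution::par_unseq, data.begin(), data.end(),
                           std::size_t{4}, 0.0f, 4.0f, hist.begin());
    // expected contents of hist: {2, 1, 2, 1}
    return 0;
}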
83 changes: 83 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
@@ -20,11 +20,13 @@
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>
#include <type_traits>
#include <omp.h>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
@@ -153,6 +155,87 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
    __f(__first, __last);
}

namespace __detail
{

template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __num_elements(0), __args(std::forward<_LocalArgs>(__args)...)
    {
        std::size_t __num_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
        __thread_specific_storage.resize(__num_threads);
    }

    // Note: size() should not be used concurrently with parallel loops which may instantiate storage objects,
    // because the count is not updated atomically with the allocation and storage of a new object; it may
    // therefore lag behind the number of storage objects actually allocated and stored.
    std::size_t
    size() const
    {
        // only count storage which has been instantiated
        return __num_elements.load();
    }

    // Note: get_with_id() should not be used concurrently with parallel loops which may instantiate storage
    // objects, as it may provide an out-of-date view of the stored objects depending on the timing of new object
    // creation and the size increment.
    // TODO: Consider replacing this access with a visitor pattern.
    _ValueType&
    get_with_id(std::size_t __i)
    {
        assert(__i < size());

        std::size_t __j = 0;

        if (size() == __thread_specific_storage.size())
        {
            return *__thread_specific_storage[__i];
        }

        for (std::size_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
        {
            // Only include storage from threads which have instantiated a storage object
            if (__thread_specific_storage[__j])
            {
                ++__count;
            }
        }
        // Back up by one: the loop advances __j one past the (__i + 1)-th instantiated storage object
        return *__thread_specific_storage[__j - 1];
    }

    _ValueType&
    get_for_current_thread()
    {
        std::size_t __i = omp_get_thread_num();
        if (!__thread_specific_storage[__i])
        {
            // create storage on first use to avoid an extra parallel region and unnecessary instantiation
            __thread_specific_storage[__i] =
                std::apply([](_Args... __arg_pack) { return std::make_unique<_ValueType>(__arg_pack...); }, __args);
            __num_elements.fetch_add(1);
        }
        return *__thread_specific_storage[__i];
    }

    std::vector<std::unique_ptr<_ValueType>> __thread_specific_storage;
    std::atomic_size_t __num_elements;
    std::tuple<_Args...> __args;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType, Args...>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType, Args...>(std::forward<Args>(__args)...);
}
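A minimal sketch of the intended usage (illustrative, not code from this commit): each OpenMP thread lazily materializes its local object on its first call to get_for_current_thread(), and a single thread afterwards combines the instantiated copies by id, which is safe once no parallel loop is running:

// Illustration: per-thread partial counts combined sequentially afterwards.
auto __tls = oneapi::dpl::__omp_backend::__make_enumerable_tls<std::vector<int>>(16, 0);

#pragma omp parallel for
for (int __i = 0; __i < 1000; ++__i)
{
    std::vector<int>& __local = __tls.get_for_current_thread(); // created lazily per thread
    ++__local[__i % 16];
}

std::vector<int> __total(16, 0);
for (std::size_t __id = 0; __id < __tls.size(); ++__id) // safe: no parallel loop is active
    for (std::size_t __b = 0; __b < 16; ++__b)
        __total[__b] += __tls.get_with_id(__id)[__b];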

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
44 changes: 43 additions & 1 deletion include/oneapi/dpl/pstl/parallel_backend_serial.h
@@ -20,11 +20,11 @@

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <type_traits>

#include "parallel_backend_utils.h"

namespace oneapi
@@ -34,6 +34,48 @@ namespace dpl
namespace __serial_backend
{

namespace __detail
{

template <typename _ValueType>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args) : __storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    std::size_t
    size() const
    {
        return std::size_t{1};
    }

    _ValueType&
    get_for_current_thread()
    {
        return __storage;
    }

    _ValueType&
    get_with_id(std::size_t /*__i*/)
    {
        return get_for_current_thread();
    }

    _ValueType __storage;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType>(std::forward<Args>(__args)...);
}

template <typename _ExecutionPolicy, typename _Tp>
using __buffer = oneapi::dpl::__utils::__buffer_impl<std::decay_t<_ExecutionPolicy>, _Tp, std::allocator>;

44 changes: 44 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
@@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
@@ -1306,6 +1307,49 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
    tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

namespace __detail
{

template <typename _ValueType>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __thread_specific_storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    std::size_t
    size() const
    {
        return __thread_specific_storage.size();
    }

    _ValueType&
    get_for_current_thread()
    {
        return __thread_specific_storage.local();
    }

    _ValueType&
    get_with_id(std::size_t __i)
    {
        return __thread_specific_storage.begin()[__i];
    }

    tbb::enumerable_thread_specific<_ValueType> __thread_specific_storage;
};

} // namespace __detail

// enumerable thread local storage should only be created via the make function
template <typename _ValueType, typename... Args>
__detail::__enumerable_thread_local_storage<_ValueType>
__make_enumerable_tls(Args&&... __args)
{
    return __detail::__enumerable_thread_local_storage<_ValueType>(std::forward<Args>(__args)...);
}
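Here the wrapper is a thin facade over tbb::enumerable_thread_specific, whose local() lazily constructs the per-thread value on first access and forwards the stored constructor arguments to each per-thread element. For comparison, the combine step written directly against TBB (illustrative, not code from this commit) could use combine_each:

// Illustration: combining per-thread partial histograms with TBB directly.
#include <tbb/enumerable_thread_specific.h>
#include <vector>

std::vector<int>
__combine_partials(tbb::enumerable_thread_specific<std::vector<int>>& __tls)
{
    std::vector<int> __total(16, 0);
    __tls.combine_each([&__total](const std::vector<int>& __local) {
        for (std::size_t __b = 0; __b < __local.size(); ++__b)
            __total[__b] += __local[__b];
    });
    return __total;
}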

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi
