Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host Implementation of Histogram APIs #1974

Merged
merged 66 commits into from
Jan 23, 2025
Merged
Show file tree
Hide file tree
Changes from 55 commits
Commits
Show all changes
66 commits
Select commit Hold shift + click to select a range
7bcd21e
histogram CPU initial implementation
danhoeflinger Nov 8, 2024
955f0e2
atomics histogram implementation
danhoeflinger Nov 8, 2024
ebcada6
clang format
danhoeflinger Nov 8, 2024
93dd493
comment about atomics
danhoeflinger Nov 8, 2024
4d95044
first draft of atomic increment (unchecked)
danhoeflinger Nov 12, 2024
a7fe8a1
atomics include and fix builtin
danhoeflinger Nov 12, 2024
6be0b7b
large case
danhoeflinger Nov 12, 2024
9e3f16c
fix threshold check
danhoeflinger Nov 15, 2024
0592c15
minor improvements
danhoeflinger Nov 19, 2024
6ac87a0
MSVC fixes
danhoeflinger Nov 20, 2024
af5defb
parallelize initialization of openMP temp histograms
danhoeflinger Nov 20, 2024
65c30af
removing unnecessary type casting
danhoeflinger Dec 13, 2024
eec36d5
improving accumulation of local histograms (simd)
danhoeflinger Dec 13, 2024
1faece5
Properly using IsVector
danhoeflinger Dec 14, 2024
a39accd
typo fix
danhoeflinger Dec 16, 2024
2cd184d
special handling thread zero to use global mem
danhoeflinger Dec 16, 2024
b3dd2a0
cleanup
danhoeflinger Dec 16, 2024
bff45d9
atomic version removal
danhoeflinger Dec 16, 2024
2c8cc04
Revert "cleanup"
danhoeflinger Dec 16, 2024
790121b
Revert "special handling thread zero to use global mem"
danhoeflinger Dec 16, 2024
0392f23
comments and cleanup
danhoeflinger Dec 16, 2024
4556399
handling coarser grained parallelism
danhoeflinger Dec 16, 2024
2fdea34
undo-ing broken thread restriction in openMP
danhoeflinger Dec 16, 2024
a0c016a
lift pattern up to algorithm_impl level
danhoeflinger Dec 18, 2024
e87ba02
cleanup unnecessary code
danhoeflinger Dec 18, 2024
9f74810
further cleanup / formatting
danhoeflinger Dec 18, 2024
e0ed14c
add grain size todo
danhoeflinger Dec 20, 2024
db2a474
more general thread local storage
danhoeflinger Dec 20, 2024
240447e
implement omp on demand tls
danhoeflinger Dec 30, 2024
741f8e3
formatting
danhoeflinger Dec 30, 2024
fd310b6
formatting
danhoeflinger Dec 30, 2024
6820340
comments and clarity
danhoeflinger Dec 30, 2024
4798d2c
bugfix for serial impl
danhoeflinger Dec 30, 2024
ce8b55f
removing debugging output
danhoeflinger Dec 30, 2024
5d1f6e8
formatting
danhoeflinger Dec 30, 2024
6630018
comment adjustment
danhoeflinger Dec 30, 2024
83a9f81
minor naming / formatting
danhoeflinger Dec 30, 2024
a3f1304
formatting
danhoeflinger Dec 30, 2024
14cca58
Address review feedback
danhoeflinger Jan 6, 2025
766f42c
formatting
danhoeflinger Jan 6, 2025
6425d37
address review feedback
danhoeflinger Jan 13, 2025
a590cac
address feedback
danhoeflinger Jan 13, 2025
dd35fc9
formatting
danhoeflinger Jan 13, 2025
455cf9c
Add comment about using `size()`
danhoeflinger Jan 13, 2025
8b094fe
fixing include errors
danhoeflinger Jan 13, 2025
7e1463e
formatting
danhoeflinger Jan 13, 2025
d9d4f08
adding const
danhoeflinger Jan 15, 2025
bebba5e
address feedback
danhoeflinger Jan 15, 2025
d494601
address feedback
danhoeflinger Jan 15, 2025
0a2847f
rename to __enumerable_thread_local_storage
danhoeflinger Jan 15, 2025
ee65630
address feedback
danhoeflinger Jan 21, 2025
f23132e
switching to == to be more clear of intention
danhoeflinger Jan 21, 2025
6606a80
restoring tbbmalloc as optional
danhoeflinger Jan 21, 2025
71ca3d8
simplifying construction of OMP
danhoeflinger Jan 21, 2025
c59a451
typo; missing underscores
danhoeflinger Jan 21, 2025
87e9780
Apply feedback
danhoeflinger Jan 22, 2025
d8f09aa
Remove large case as it is not needed for coverage
danhoeflinger Jan 22, 2025
2c114ee
moving histogram pattern to histogram_impl
danhoeflinger Jan 22, 2025
1b7f146
implement make function in backend
danhoeflinger Jan 22, 2025
a2fb53b
Adding comment for get_with_id
danhoeflinger Jan 22, 2025
397913f
clang format
danhoeflinger Jan 22, 2025
4db8104
rename make function
danhoeflinger Jan 22, 2025
66d15bc
fix includes
danhoeflinger Jan 22, 2025
1146bb6
rename variables away from global
danhoeflinger Jan 22, 2025
8b35ae6
clang format
danhoeflinger Jan 22, 2025
aa61385
simplify lambda using indices
danhoeflinger Jan 22, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -336,7 +336,7 @@ elseif(ONEDPL_BACKEND MATCHES "^(omp)$")
if (OpenMP_CXX_FLAGS MATCHES ".*-fiopenmp.*")
set(_openmp_flag -fopenmp)
elseif (OpenMP_CXX_FLAGS MATCHES ".*[-/]Qiopenmp.*")
set(_openmp_flag /Qopenmp)
set(_openmp_flag -Qopenmp)
endif()
if (_openmp_flag)
message(STATUS "Using ${_openmp_flag} for openMP")
Expand Down
89 changes: 89 additions & 0 deletions include/oneapi/dpl/pstl/algorithm_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,13 @@ namespace dpl
namespace __internal
{

// Factory helper: constructs the active parallel backend's
// __enumerable_thread_local_storage for per-thread values of _ValueType.
// The forwarded __args are stored and used as the constructor arguments for
// each thread's lazily-created local value (see the backend implementations).
template <typename _ValueType, typename... Args>
__par_backend::__enumerable_thread_local_storage<_ValueType, Args...>
__make_etls(Args&&... __args)
{
    return __par_backend::__enumerable_thread_local_storage<_ValueType, Args...>(std::forward<Args>(__args)...);
}

//------------------------------------------------------------------------
// any_of
//------------------------------------------------------------------------
Expand Down Expand Up @@ -4289,6 +4296,88 @@ __pattern_shift_right(_Tag __tag, _ExecutionPolicy&& __exec, _BidirectionalItera
return __res.base();
}

// Serial histogram kernel: bins each element of [__first, __last) into the
// caller-provided histogram via the bin-hash functor. Per the histogram
// specification, input values that do not map to a defined bin (signalled by a
// negative bin index from get_bin) are skipped silently.
template <class _ForwardIterator, class _IdxHashFunc, class _RandomAccessIterator, class _IsVector>
void
__brick_histogram(_ForwardIterator __first, _ForwardIterator __last, _IdxHashFunc __func,
                  _RandomAccessIterator __histogram_first, _IsVector) noexcept
{
    while (__first != __last)
    {
        const std::int32_t __bin = __func.get_bin(*__first);
        // A negative index means the value falls outside every bin: skip it.
        if (__bin >= 0)
            ++__histogram_first[__bin];
        ++__first;
    }
}

// Histogram pattern for the serial and parallel-forward tags: zero the output
// range, then run the serial histogram brick over the whole input.
template <class _Tag, class _ExecutionPolicy, class _ForwardIterator, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
                    _Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator __histogram_first)
{
    // This overload only serves the non-parallel tags; the parallel tag has a
    // dedicated overload below.
    static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);
    using _BinValueT = typename std::iterator_traits<_RandomAccessIterator>::value_type;
    // The output histogram must be zero-initialized before accumulation.
    __pattern_fill(_Tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
                   _BinValueT{0});
    __brick_histogram(__first, __last, __func, __histogram_first, typename _Tag::__is_vector{});
}

// Parallel histogram pattern: each thread accumulates into its own
// zero-initialized local histogram (enumerable thread-local storage), then the
// local histograms are reduced into the output histogram in a second parallel
// pass partitioned over bins.
template <class _IsVector, class _ExecutionPolicy, class _RandomAccessIterator1, class _Size, class _IdxHashFunc,
          class _RandomAccessIterator2>
void
__pattern_histogram(__parallel_tag<_IsVector>, _ExecutionPolicy&& __exec, _RandomAccessIterator1 __first,
                    _RandomAccessIterator1 __last, _Size __num_bins, _IdxHashFunc __func,
                    _RandomAccessIterator2 __histogram_first)
{
    using __backend_tag = typename __parallel_tag<_IsVector>::__backend_tag;
    using _HistogramValueT = typename std::iterator_traits<_RandomAccessIterator2>::value_type;
    using _DiffType = typename std::iterator_traits<_RandomAccessIterator2>::difference_type;

    const _DiffType __n = __last - __first;
    if (__n == 0)
    {
        // An empty input still requires the output histogram to be zero-filled.
        __pattern_fill(__parallel_tag<_IsVector>{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first,
                       __histogram_first + __num_bins, _HistogramValueT{0});
    }
    else
    {
        // One zero-initialized local histogram per participating thread,
        // created lazily on first use by the backend's TLS abstraction.
        auto __tls =
            oneapi::dpl::__internal::__make_etls<std::vector<_HistogramValueT>>(__num_bins, _HistogramValueT{0});

        // Main histogram loop over the input.
        //TODO: add defaulted grain-size option for __parallel_for and use larger one here to account for overhead
        __par_backend::__parallel_for(
            __backend_tag{}, __exec, __first, __last,
            [__func, &__tls](_RandomAccessIterator1 __chunk_begin, _RandomAccessIterator1 __chunk_end) {
                __internal::__brick_histogram(__chunk_begin, __chunk_end, __func,
                                              __tls.get_for_current_thread().begin(), _IsVector{});
            });

        // Reduce the per-thread local histograms into the output histogram,
        // parallelized across bins.
        const std::size_t __num_local_copies = __tls.size();
        __par_backend::__parallel_for(
            __backend_tag{}, std::forward<_ExecutionPolicy>(__exec), __histogram_first, __histogram_first + __num_bins,
            [__num_local_copies, __histogram_first, &__tls](auto __out_begin, auto __out_end) {
                const _DiffType __bins_in_chunk = __out_end - __out_begin;
                const _DiffType __bin_offset = __out_begin - __histogram_first;
                // Seed the output range by assigning from the first local
                // histogram (avoids a separate zero-fill pass)...
                __internal::__brick_walk2_n(__tls.get_with_id(0).begin() + __bin_offset, __bins_in_chunk, __out_begin,
                                            oneapi::dpl::__internal::__pstl_assign(), _IsVector{});
                // ...then accumulate the remaining local histograms with +=.
                for (std::size_t __copy = 1; __copy < __num_local_copies; ++__copy)
                {
                    __internal::__brick_walk2_n(
                        __tls.get_with_id(__copy).begin() + __bin_offset, __bins_in_chunk, __out_begin,
                        [](_HistogramValueT __x, _HistogramValueT& __y) { __y += __x; }, _IsVector{});
                }
            });
    }
}

} // namespace __internal
} // namespace dpl
} // namespace oneapi
Expand Down
18 changes: 1 addition & 17 deletions include/oneapi/dpl/pstl/histogram_impl.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#include "histogram_extension_defs.h"
#include "histogram_binhash_utils.h"
#include "iterator_impl.h"
#include "algorithm_impl.h"
akukanov marked this conversation as resolved.
Show resolved Hide resolved

#if _ONEDPL_HETERO_BACKEND
# include "hetero/histogram_impl_hetero.h"
Expand All @@ -29,23 +30,6 @@ namespace oneapi
namespace dpl
{

namespace __internal
{

template <class _Tag, typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _IdxHashFunc,
typename _RandomAccessIterator2>
void
__pattern_histogram(_Tag, _ExecutionPolicy&& exec, _RandomAccessIterator1 __first, _RandomAccessIterator1 __last,
_Size __num_bins, _IdxHashFunc __func, _RandomAccessIterator2 __histogram_first)
{
static_assert(__is_serial_tag_v<_Tag> || __is_parallel_forward_tag_v<_Tag>);

static_assert(sizeof(_Size) == 0 /*false*/,
"Histogram API is currently unsupported for policies other than device execution policies");
}

} // namespace __internal

template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _Size, typename _ValueType,
typename _RandomAccessIterator2>
oneapi::dpl::__internal::__enable_if_execution_policy<_ExecutionPolicy, _RandomAccessIterator2>
Expand Down
66 changes: 66 additions & 0 deletions include/oneapi/dpl/pstl/omp/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,13 @@
#include <atomic>
#include <iterator>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <memory>
#include <vector>
#include <type_traits>
#include <omp.h>
#include <tuple>

#include "../parallel_backend_utils.h"
#include "../unseq_backend_simd.h"
Expand Down Expand Up @@ -153,6 +155,70 @@ __process_chunk(const __chunk_metrics& __metrics, _Iterator __base, _Index __chu
__f(__first, __last);
}

template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
template <typename... _LocalArgs>
__enumerable_thread_local_storage(_LocalArgs&&... __args)
: __num_elements(0), __args(std::forward<_LocalArgs>(__args)...)
{
std::size_t __num_threads = omp_in_parallel() ? omp_get_num_threads() : omp_get_max_threads();
__thread_specific_storage.resize(__num_threads);
}

// Note: Size should not be used concurrently with parallel loops which may instantiate storage objects, as it may
// not return an accurate count of instantiated storage objects in lockstep with the number allocated and stored.
// This is because the count is not atomic with the allocation and storage of the storage objects.
std::size_t
size() const
{
// only count storage which has been instantiated
return __num_elements.load();
}

_ValueType&
get_with_id(std::size_t __i)
{
assert(__i < size());

std::size_t __j = 0;

if (size() == __thread_specific_storage.size())
{
return *__thread_specific_storage[__i];
}

for (std::size_t __count = 0; __j < __thread_specific_storage.size() && __count <= __i; ++__j)
{
// Only include storage from threads which have instantiated a storage object
if (__thread_specific_storage[__j])
{
__count++;
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
}
}
// Need to back up one once we have found a valid storage object
return *__thread_specific_storage[__j - 1];
}

_ValueType&
get_for_current_thread()
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
{
std::size_t __i = omp_get_thread_num();
if (!__thread_specific_storage[__i])
SergeyKopienko marked this conversation as resolved.
Show resolved Hide resolved
{
// create temporary storage on first usage to avoid extra parallel region and unnecessary instantiation
__thread_specific_storage[__i] =
std::apply([](_Args... __arg_pack) { return std::make_unique<_ValueType>(__arg_pack...); }, __args);
__num_elements.fetch_add(1);
}
return *__thread_specific_storage[__i];
}

std::vector<std::unique_ptr<_ValueType>> __thread_specific_storage;
std::atomic_size_t __num_elements;
std::tuple<_Args...> __args;
};

} // namespace __omp_backend
} // namespace dpl
} // namespace oneapi
Expand Down
31 changes: 30 additions & 1 deletion include/oneapi/dpl/pstl/parallel_backend_serial.h
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,11 @@

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <memory>
#include <numeric>
#include <utility>
#include <type_traits>

#include "parallel_backend_utils.h"

namespace oneapi
Expand All @@ -42,6 +42,35 @@ __cancel_execution(oneapi::dpl::__internal::__serial_backend_tag)
{
}

// Enumerable thread-local storage for the serial backend: there is only ever
// one thread of execution, so thread-"local" storage degenerates to a single
// eagerly constructed value.
template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args) : __storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    // Exactly one storage object exists in the serial backend.
    std::size_t
    size() const
    {
        return 1;
    }

    // Every id refers to the one and only storage object.
    _ValueType&
    get_with_id(std::size_t /*__i*/)
    {
        return __storage;
    }

    _ValueType&
    get_for_current_thread()
    {
        return __storage;
    }

    _ValueType __storage;
};

template <class _ExecutionPolicy, class _Index, class _Fp>
void
__parallel_for(oneapi::dpl::__internal::__serial_backend_tag, _ExecutionPolicy&&, _Index __first, _Index __last,
Expand Down
31 changes: 31 additions & 0 deletions include/oneapi/dpl/pstl/parallel_backend_tbb.h
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include <tbb/parallel_invoke.h>
#include <tbb/task_arena.h>
#include <tbb/tbb_allocator.h>
#include <tbb/enumerable_thread_specific.h>
#if TBB_INTERFACE_VERSION > 12000
# include <tbb/task.h>
#endif
Expand Down Expand Up @@ -1306,6 +1307,36 @@ __parallel_for_each(oneapi::dpl::__internal::__tbb_backend_tag, _ExecutionPolicy
tbb::this_task_arena::isolate([&]() { tbb::parallel_for_each(__begin, __end, __f); });
}

// Enumerable thread-local storage for the TBB backend: a thin adaptor over
// tbb::enumerable_thread_specific. A thread's storage object is lazily
// constructed on first access from the constructor's argument pack.
template <typename _ValueType, typename... _Args>
struct __enumerable_thread_local_storage
{
    template <typename... _LocalArgs>
    __enumerable_thread_local_storage(_LocalArgs&&... __args)
        : __thread_specific_storage(std::forward<_LocalArgs>(__args)...)
    {
    }

    // Number of thread-local storage objects instantiated so far.
    std::size_t
    size() const
    {
        return __thread_specific_storage.size();
    }

    // Returns the calling thread's storage object, creating it on first use.
    _ValueType&
    get_for_current_thread()
    {
        return __thread_specific_storage.local();
    }

    // Returns the __i-th instantiated storage object, in the iteration order
    // of the underlying container. NOTE(review): presumably only valid once no
    // new storage can be created concurrently (i.e. after the parallel phase)
    // -- confirm against callers.
    _ValueType&
    get_with_id(std::size_t __i)
    {
        return __thread_specific_storage.begin()[__i];
    }

    tbb::enumerable_thread_specific<_ValueType> __thread_specific_storage;
};

} // namespace __tbb_backend
} // namespace dpl
} // namespace oneapi
Expand Down
Loading
Loading