diff --git a/CHANGES b/CHANGES index 330c0602b80..b3bebc316ee 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,41 @@ +------------------------------------------------------------------------ +The list of most significant changes made over time in Parallel STL. + +Parallel STL 20171127 release +PSTL_VERSION == 102 + +Features / APIs: + +- Added Parallel STL version macros: + PSTL_VERSION, PSTL_VERSION_MAJOR, PSTL_VERSION_MINOR. +- More algorithms support parallel and vector execution policies: + move, partition_copy, mismatch. +- More algorithms support parallel execution policies: + min_element, max_element, minmax_element. + +------------------------------------------------------------------------ +Parallel STL release within Intel(R) Parallel Studio XE 2018 Update 1 + +Features / APIs: + +- More algorithms support parallel and vector execution policies: + destroy, destroy_n, uninitialized_copy, uninitialized_copy_n, + uninitialized_default_construct, uninitialized_default_construct_n, + uninitialized_fill, uninitialized_fill_n, uninitialized_move, + uninitialized_move_n, uninitialized_value_construct, + uninitialized_value_construct_n. +- Improved performance in find_end and search algorithms. +- Added macro PSTL_USE_NONTEMPORAL_STORES that can improve performance + of copy, copy_n, fill, fill_n, generate, generate_n algorithms with + unseq and par_unseq policies; by default the macro is not defined. + +Bugs fixed: + +- Fixed transform_inclusive_scan to correctly process the first element. +- Fixed compile time error in sort algorithm when used with zip_iterator + and some other custom iterator types. +- Fixed several algorithms to allow use of non-const functors. 
+ ------------------------------------------------------------------------ Parallel STL release within Intel(R) Parallel Studio XE 2018 diff --git a/README.md b/README.md index 5bc468d5f9e..f70f728f8ae 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,5 @@ # Parallel STL -[![Stable release](https://img.shields.io/badge/version-20170726-green.svg)](https://github.com/intel/parallelstl/releases/tag/20170726) +[![Stable release](https://img.shields.io/badge/version-20171127-green.svg)](https://github.com/intel/parallelstl/releases/tag/20171127) [![Apache License Version 2.0](https://img.shields.io/badge/license-Apache_2.0-green.svg)](LICENSE) Parallel STL is an implementation of the C++ standard library algorithms with support for execution policies, diff --git a/doc/Release_Notes.txt b/doc/Release_Notes.txt index c57a683b066..a72a6889782 100644 --- a/doc/Release_Notes.txt +++ b/doc/Release_Notes.txt @@ -8,8 +8,8 @@ System Requirements Parallel STL is available as a part of Intel(R) Parallel Studio XE 2018 and Intel(R) System Studio 2018. - Parallel STL distributions are validated and officially supported for -the hardware, software, operating systems and compilers listed here. + Parallel STL distributions are validated and officially supported +for the hardware, software, operating systems and compilers listed here. Hardware - Recommended @@ -71,6 +71,9 @@ Software - Supported Operating Systems Software - Supported Compilers Intel(R) C++ Compiler 16, 17 and 18 version + Note: Using Intel C++ Compiler 18.0 might result + in better performance for many Parallel STL algorithms + compared to previous compiler versions. 
Microsoft* Visual C++ 14.0 (Microsoft* Visual Studio* 2015, Windows* OS only) Microsoft* Visual C++ 14.1 (Microsoft* Visual Studio* 2017, @@ -95,8 +98,8 @@ Known Issues or limitations ------------------------------------------------------------------------ -Intel, the Intel logo, Intel Core, Intel Atom, Xeon, Intel Xeon Phi, and -Pentium are trademarks of Intel Corporation in the U.S. and/or other -countries. +Intel, the Intel logo, Intel Core, Intel Atom, Xeon, Intel Xeon Phi, +and Pentium are trademarks of Intel Corporation in the U.S. and/or +other countries. * Other names and brands may be claimed as the property of others. diff --git a/examples/convex_hull/Makefile b/examples/convex_hull/Makefile index 4395c9d3ad5..a6e75bd3d51 100644 --- a/examples/convex_hull/Makefile +++ b/examples/convex_hull/Makefile @@ -22,17 +22,20 @@ PROG=convex_hull.exe ARGS= CXXFLAGS += -D__PSTL_USE_TBB -std=c++11 -# The C++ compiler + +# Use icc as the default C++ compiler if it is present ifneq (,$(shell which icc 2>/dev/null)) -CXX=icc +CXX = icc +endif # which icc +ifeq ($(CXX),icc) +CXXFLAGS += -qopenmp-simd ifneq (, $(filter $(target), mic)) CXXFLAGS += -mmic -else +else CXXFLAGS += -xHOST -endif -CXXFLAGS += -qopenmp-simd -endif # which icc +endif # target is mic or host? +endif # icc? 
ifeq ($(shell uname), Linux) LIBS+= -lrt diff --git a/examples/convex_hull/msvs/convex_hull.vcxproj b/examples/convex_hull/msvs/convex_hull.vcxproj index d073fde9521..093f60c21aa 100644 --- a/examples/convex_hull/msvs/convex_hull.vcxproj +++ b/examples/convex_hull/msvs/convex_hull.vcxproj @@ -28,14 +28,14 @@ Application true - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true @@ -43,14 +43,14 @@ Application true - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true diff --git a/examples/dot_product/Makefile b/examples/dot_product/Makefile index 5b7dd467241..2e70dc98252 100644 --- a/examples/dot_product/Makefile +++ b/examples/dot_product/Makefile @@ -22,17 +22,20 @@ PROG=dot_product.exe ARGS= CXXFLAGS += -D__PSTL_USE_TBB -std=c++11 -# The C++ compiler + +# Use icc as the default C++ compiler if it is present ifneq (,$(shell which icc 2>/dev/null)) -CXX=icc +CXX = icc +endif # which icc +ifeq ($(CXX),icc) +CXXFLAGS += -qopenmp-simd ifneq (, $(filter $(target), mic)) CXXFLAGS += -mmic -else +else CXXFLAGS += -xHOST -endif -CXXFLAGS += -qopenmp-simd -endif # which icc +endif # target is mic or host? +endif # icc? 
ifeq ($(shell uname), Linux) LIBS+= -lrt diff --git a/examples/dot_product/msvs/dot_product.vcxproj b/examples/dot_product/msvs/dot_product.vcxproj index 88c897950b2..00584190ab2 100644 --- a/examples/dot_product/msvs/dot_product.vcxproj +++ b/examples/dot_product/msvs/dot_product.vcxproj @@ -28,14 +28,14 @@ Application true - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true @@ -43,14 +43,14 @@ Application true - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true diff --git a/examples/gamma_correction/Makefile b/examples/gamma_correction/Makefile index d315e485c63..3ea680d528f 100644 --- a/examples/gamma_correction/Makefile +++ b/examples/gamma_correction/Makefile @@ -22,17 +22,20 @@ PROG=gamma_correction.exe ARGS= CXXFLAGS += -D__PSTL_USE_TBB -std=c++11 -# The C++ compiler + +# Use icc as the default C++ compiler if it is present ifneq (,$(shell which icc 2>/dev/null)) -CXX=icc +CXX = icc +endif # which icc +ifeq ($(CXX),icc) +CXXFLAGS += -qopenmp-simd ifneq (, $(filter $(target), mic)) CXXFLAGS += -mmic -else +else CXXFLAGS += -xHOST -endif -CXXFLAGS += -qopenmp-simd -endif # which icc +endif # target is mic or host? +endif # icc? 
ifeq ($(shell uname), Linux) LIBS+= -lrt diff --git a/examples/gamma_correction/msvs/gamma_correction.sln b/examples/gamma_correction/msvs/gamma_correction.sln index f5d204c585d..f968409992d 100644 --- a/examples/gamma_correction/msvs/gamma_correction.sln +++ b/examples/gamma_correction/msvs/gamma_correction.sln @@ -8,19 +8,19 @@ EndProject Global GlobalSection(SolutionConfigurationPlatforms) = preSolution Debug|x64 = Debug|x64 - Debug|x86 = Debug|x86 + Debug|Win32 = Debug|Win32 Release|x64 = Release|x64 - Release|x86 = Release|x86 + Release|Win32 = Release|Win32 EndGlobalSection GlobalSection(ProjectConfigurationPlatforms) = postSolution {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|x64.ActiveCfg = Debug|x64 {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|x64.Build.0 = Debug|x64 - {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|x86.ActiveCfg = Debug|Win32 - {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|x86.Build.0 = Debug|Win32 + {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|Win32.ActiveCfg = Debug|Win32 + {33020498-816E-4A1D-A073-B0E4834AC979}.Debug|Win32.Build.0 = Debug|Win32 {33020498-816E-4A1D-A073-B0E4834AC979}.Release|x64.ActiveCfg = Release|x64 {33020498-816E-4A1D-A073-B0E4834AC979}.Release|x64.Build.0 = Release|x64 - {33020498-816E-4A1D-A073-B0E4834AC979}.Release|x86.ActiveCfg = Release|Win32 - {33020498-816E-4A1D-A073-B0E4834AC979}.Release|x86.Build.0 = Release|Win32 + {33020498-816E-4A1D-A073-B0E4834AC979}.Release|Win32.ActiveCfg = Release|Win32 + {33020498-816E-4A1D-A073-B0E4834AC979}.Release|Win32.Build.0 = Release|Win32 EndGlobalSection GlobalSection(SolutionProperties) = preSolution HideSolutionNode = FALSE diff --git a/examples/gamma_correction/msvs/gamma_correction.vcxproj b/examples/gamma_correction/msvs/gamma_correction.vcxproj index 63ceba6e568..b5ce12fc59f 100644 --- a/examples/gamma_correction/msvs/gamma_correction.vcxproj +++ b/examples/gamma_correction/msvs/gamma_correction.vcxproj @@ -28,14 +28,14 @@ Application true - Intel C++ Compiler 17.0 + 
Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true @@ -43,14 +43,14 @@ Application true - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 Unicode true Application false - Intel C++ Compiler 17.0 + Intel C++ Compiler 18.0 true Unicode true diff --git a/include/pstl/_internal/memory_impl.h b/include/pstl/_internal/memory_impl.h deleted file mode 100644 index 0dbd74203dc..00000000000 --- a/include/pstl/_internal/memory_impl.h +++ /dev/null @@ -1,415 +0,0 @@ -/* - Copyright (c) 2017 Intel Corporation - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
- - - - -*/ - -#ifndef __PSTL_memory_impl_H -#define __PSTL_memory_impl_H - -#include -#include -#include "execution_policy_impl.h" - -namespace __icp_algorithm { - -//------------------------------------------------------------------------ -// uninitialized_copy -//------------------------------------------------------------------------ - -template -ForwardIterator brick_uninitialized_copy(InputIterator first, InputIterator last, ForwardIterator result, /*is_vector=*/std::false_type) noexcept { - return std::uninitialized_copy(first, last, result); -} - -template -ForwardIterator brick_uninitialized_copy(InputIterator first, InputIterator last, ForwardIterator result, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return std::uninitialized_copy(first, last, result); -} - -template -ForwardIterator pattern_uninitialized_copy(InputIterator first, InputIterator last, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_copy(first, last, result, is_vector); -} - -template -ForwardIterator pattern_uninitialized_copy(InputIterator first, InputIterator last, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_copy(first, last, result, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_copy_n -//------------------------------------------------------------------------ - -template -ForwardIterator brick_uninitialized_copy_n(InputIterator first, Size n, ForwardIterator result, /*is_vector=*/std::false_type) noexcept { - return std::uninitialized_copy_n(first, n, result); -} - -template -ForwardIterator brick_uninitialized_copy_n(InputIterator first, Size n, ForwardIterator result, /*is_vector=*/std::true_type) noexcept { 
-__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return std::uninitialized_copy_n(first, n, result); -} - -template -ForwardIterator pattern_uninitialized_copy_n(InputIterator first, Size n, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_copy_n(first, n, result, is_vector); -} - -template -ForwardIterator pattern_uninitialized_copy_n(InputIterator first, Size n, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_copy_n(first, n, result, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_move -//------------------------------------------------------------------------ - -template -void destroy_serial(ForwardIterator first, ForwardIterator last) { - typedef typename std::iterator_traits::value_type T; - while (first != last) { - (*first).~T(); - ++first; - } -} - -template -ForwardIterator brick_uninitialized_move(InputIterator first, InputIterator last, ForwardIterator result, /*is_vector=*/std::false_type) noexcept { - typedef typename std::iterator_traits::value_type Value; - ForwardIterator current = result; - - try { - while (first != last) - new (static_cast(std::addressof(*(result++)))) Value(std::move(*(first++))); - - return result; - } catch (...) 
{ - destroy_serial(current, result); - std::terminate(); - } - - return result; -} - -template -ForwardIterator brick_uninitialized_move(InputIterator first, InputIterator last, ForwardIterator result, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return brick_uninitialized_move(first, last, result, std::false_type()); -} - -template -ForwardIterator pattern_uninitialized_move(InputIterator first, InputIterator last, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_move(first, last, result, is_vector); -} - -template -ForwardIterator pattern_uninitialized_move(InputIterator first, InputIterator last, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_move(first, last, result, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_move_n -//------------------------------------------------------------------------ - -template -ForwardIterator brick_uninitialized_move_n(InputIterator first, Size n, ForwardIterator result, /*is_vector=*/std::false_type) noexcept { - typedef typename std::iterator_traits::value_type Value; - ForwardIterator current = result; - - try { - while (n-- > 0) - new (static_cast(std::addressof(*(result++)))) Value(std::move(*(first++))); - - return result; - } catch (...) 
{ - destroy_serial(current, result); - std::terminate(); - } - - return result; -} - -template -ForwardIterator brick_uninitialized_move_n(InputIterator first, Size n, ForwardIterator result, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return brick_uninitialized_move_n(first, n, result, std::false_type()); -} - -template -ForwardIterator pattern_uninitialized_move_n(InputIterator first, Size n, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_move_n(first, n, result, is_vector); -} - -template -ForwardIterator pattern_uninitialized_move_n(InputIterator first, Size n, ForwardIterator result, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_move_n(first, n, result, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_fill -//------------------------------------------------------------------------ - -template -void brick_uninitialized_fill(ForwardIterator first, ForwardIterator last, const T& x, /*is_vector=*/std::false_type) noexcept { - std::uninitialized_fill(first, last, x); -} - -template -void brick_uninitialized_fill(ForwardIterator first, ForwardIterator last, const T& x, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - std::uninitialized_fill(first, last, x); -} - -template -void pattern_uninitialized_fill(ForwardIterator first, ForwardIterator last, const T& x, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - brick_uninitialized_fill(first, last, x, is_vector); -} - -template -void pattern_uninitialized_fill(ForwardIterator first, ForwardIterator last, const T& x, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { 
-__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - brick_uninitialized_fill(first, last, x, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_fill_n -//------------------------------------------------------------------------ -// Some C++11 compilers don't have a version of the algorithm std::uninitialized_fill_n that returns an iterator to the element past the last element filled. -template< class ForwardIterator, class Size, class T > -ForwardIterator uninitialized_fill_n_serial(ForwardIterator first, Size n, const T& x) -{ - typedef typename std::iterator_traits::value_type Value; - auto cur = first; - try { - while (n--) { - ::new (static_cast(std::addressof(*cur))) Value(x); - ++cur; - } - return cur; - } - catch (...) { - destroy_serial(first, cur); - std::terminate(); - } -} - -template -ForwardIterator brick_uninitialized_fill_n(ForwardIterator first, Size n, const T& x, /*is_vector=*/std::false_type) noexcept { - return uninitialized_fill_n_serial(first, n, x); -} - -template -ForwardIterator brick_uninitialized_fill_n(ForwardIterator first, Size n, const T& x, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return brick_uninitialized_fill_n(first, n, x, std::false_type()); -} - -template -ForwardIterator pattern_uninitialized_fill_n(ForwardIterator first, Size n, const T& x, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_fill_n(first, n, x, is_vector); -} - -template -ForwardIterator pattern_uninitialized_fill_n(ForwardIterator first, Size n, const T& x, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_fill_n(first, n, x, is_vector); -} - -//------------------------------------------------------------------------ -// 
destroy -//------------------------------------------------------------------------ - -template -void brick_destroy(ForwardIterator first, ForwardIterator last, /*is_vector=*/std::false_type) noexcept { - destroy_serial(first, last); -} - -template -void brick_destroy(ForwardIterator first, ForwardIterator last, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - brick_destroy(first, last, std::false_type()); -} - -template -void pattern_destroy(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - brick_destroy(first, last, is_vector); -} - -template -void pattern_destroy(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - brick_destroy(first, last, is_vector); -} - -//------------------------------------------------------------------------ -// destroy_n -//------------------------------------------------------------------------ - -template -ForwardIterator destroy_n_serial(ForwardIterator first, Size n) { - typedef typename std::iterator_traits::value_type T; - while (n--) { - (*first).~T(); - ++first; - } - return first; -} - -template -ForwardIterator brick_destroy_n(ForwardIterator first, Size n, /*is_vector=*/std::false_type) noexcept { - return destroy_n_serial(first, n); -} - -template -ForwardIterator brick_destroy_n(ForwardIterator first, Size n, /*is_vector=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return brick_destroy_n(first, n, std::false_type()); -} - -template -ForwardIterator pattern_destroy_n(ForwardIterator first, Size n, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_destroy_n(first, n, is_vector); -} - -template -ForwardIterator pattern_destroy_n(ForwardIterator first, Size n, 
IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { -__PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_destroy_n(first, n, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_default_construct -//------------------------------------------------------------------------ -template -struct Construct { - void operator()(void* ptr) { - ::new (ptr) T; - } -}; - -template -struct Construct { - void operator()(void* ptr) { - ::new (ptr) T(); - } -}; - -template -void brick_uninitialized_construct(ForwardIterator first, ForwardIterator last, /*is_vector=*/std::false_type) noexcept { - typedef typename std::iterator_traits::value_type value_type; - auto cur = first; // Save the iterator for catching exceptions - try { - for (; cur != last; ++cur) - Construct()(static_cast(std::addressof(*cur))); - } - catch (...) { - destroy_serial(first, cur); - std::terminate(); - } -} - -template -void brick_uninitialized_construct(ForwardIterator first, ForwardIterator last, /*is_vector=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - brick_uninitialized_construct(first, last, std::false_type()); -} - -template -void pattern_uninitialized_default_construct(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - brick_uninitialized_construct(first, last, is_vector); -} - -template -void pattern_uninitialized_default_construct(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - brick_uninitialized_construct(first, last, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_default_construct_n 
-//------------------------------------------------------------------------ - -template -ForwardIterator brick_uninitialized_construct_n(ForwardIterator first, Size n, /*is_vector=*/std::false_type) noexcept { - typedef typename std::iterator_traits::value_type value_type; - auto cur = first; - try { - for (; n > 0; ++cur, --n) - Construct()(static_cast(std::addressof(*cur))); - return cur; - } - catch (...) { - destroy_serial(first, cur); - std::terminate(); - } -} - -template -ForwardIterator brick_uninitialized_construct_n(ForwardIterator first, Size n, /*is_vector=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorial algorithm unimplemented, redirected to serial"); - return brick_uninitialized_construct_n(first, n, std::false_type()); -} - -template -ForwardIterator pattern_uninitialized_default_construct_n(ForwardIterator first, Size n, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_construct_n(first, n, is_vector); -} - -template -ForwardIterator pattern_uninitialized_default_construct_n(ForwardIterator first, Size n, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_construct_n(first, n, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_value_construct -//------------------------------------------------------------------------ - -template -void pattern_uninitialized_value_construct(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - brick_uninitialized_construct(first, last, is_vector); -} - -template -void pattern_uninitialized_value_construct(ForwardIterator first, ForwardIterator last, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - 
brick_uninitialized_construct(first, last, is_vector); -} - -//------------------------------------------------------------------------ -// uninitialized_value_construct_n -//------------------------------------------------------------------------ - -template -ForwardIterator pattern_uninitialized_value_construct_n(ForwardIterator first, Size n, IsVector is_vector, /*is_parallel=*/std::false_type) noexcept { - return brick_uninitialized_construct_n(first, n, is_vector); -} - -template -ForwardIterator pattern_uninitialized_value_construct_n(ForwardIterator first, Size n, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_uninitialized_construct_n(first, n, is_vector); -} -} // namespace __icp_algorithm -#endif //__PSTL_memory_impl_H diff --git a/include/pstl/algorithm b/include/pstl/algorithm index 6cc7952b513..d6a18ddc5a3 100644 --- a/include/pstl/algorithm +++ b/include/pstl/algorithm @@ -23,136 +23,137 @@ #include -#include "_internal/pstl_config.h" -#include "_internal/common.h" -#include "_internal/simd_impl.h" -#include "_internal/algorithm_impl.h" -#include "_internal/numeric_impl.h" /* count and count_if use pattern_transform_reduce */ -#if __PSTL_USE_TBB - #include "_internal/parallel_impl_tbb.h" -#else - __PSTL_PRAGMA_MESSAGE("Backend was not specified"); -#endif +#include "internal/pstl_config.h" +#include "internal/common.h" +#include "internal/simd_impl.h" +#include "internal/algorithm_impl.h" +#include "internal/numeric_impl.h" /* count and count_if use pattern_transform_reduce */ namespace std { // [alg.any_of] template -__icp_algorithm::enable_if_execution_policy -any_of( ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred ) { - return __icp_algorithm::pattern_any_of( first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); 
+pstl::internal::enable_if_execution_policy +any_of(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred) { + using namespace pstl::internal; + return pattern_any_of( first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.all_of] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy all_of(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Pred pred) { - return !any_of(std::forward(exec), first, last, __icp_algorithm::not_pred(pred)); + return !any_of(std::forward(exec), first, last, pstl::internal::not_pred(pred)); } // [alg.none_of] template -__icp_algorithm::enable_if_execution_policy -none_of( ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred ) { +pstl::internal::enable_if_execution_policy +none_of(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred) { return !any_of( std::forward(exec), first, last, pred ); } // [alg.foreach] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy for_each(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Function f) { - __icp_algorithm::pattern_walk1( + using namespace pstl::internal; + pattern_walk1( first, last, f, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy for_each_n(ExecutionPolicy&& exec, InputIterator first, Size n, Function f) { - return __icp_algorithm::pattern_walk1_n( first, n, f, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_walk1_n(first, n, f, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } 
// [alg.find] template -__icp_algorithm::enable_if_execution_policy -find_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, -Predicate pred) { - return __icp_algorithm::pattern_find_if( first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); +pstl::internal::enable_if_execution_policy +find_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred) { + using namespace pstl::internal; + return pattern_find_if( first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy find_if_not(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred) { - return find_if(exec,first,last,__icp_algorithm::not_pred(pred)); + return find_if(exec, first, last, pstl::internal::not_pred(pred)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy find(ExecutionPolicy&& exec, InputIterator first, InputIterator last, const T& value) { - return find_if(exec, first, last, __icp_algorithm::equal_value(value)); + return find_if(exec, first, last, pstl::internal::equal_value(value)); } // [alg.find.end] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy find_end(ExecutionPolicy &&exec, ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last, BinaryPredicate pred) { - return __icp_algorithm::pattern_find_end(first, last, s_first, s_last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_find_end(first, last, s_first, s_last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy 
+pstl::internal::enable_if_execution_policy find_end(ExecutionPolicy&& exec, ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last) { return find_end(exec, first, last, s_first, s_last, std::equal_to::value_type>()); } // [alg.find_first_of] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy find_first_of(ExecutionPolicy&& exec, InputIterator first, InputIterator last, ForwardIterator s_first, ForwardIterator s_last, BinaryPredicate pred) { - return __icp_algorithm::pattern_find_first_of(first, last, s_first, s_last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_find_first_of(first, last, s_first, s_last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy find_first_of(ExecutionPolicy&& exec, InputIterator first, InputIterator last, ForwardIterator s_first, ForwardIterator s_last) { return find_first_of(exec, first, last, s_first, s_last, std::equal_to::value_type>()); } // [alg.adjacent_find] template< class ExecutionPolicy, class ForwardIt > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy adjacent_find(ExecutionPolicy&& exec, ForwardIt first, ForwardIt last) { typedef typename iterator_traits::value_type value_type; - - return __icp_algorithm::pattern_adjacent_find(first, last, std::equal_to(), - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec), /*first_semantic*/ false); + using namespace pstl::internal; + return pattern_adjacent_find(first, last, std::equal_to(), + is_parallelization_preferred(exec), + is_vectorization_preferred(exec), /*first_semantic*/ false); } template< class ExecutionPolicy, class ForwardIt, class 
BinaryPredicate> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy adjacent_find(ExecutionPolicy&& exec, ForwardIt first, ForwardIt last, BinaryPredicate pred) { -return __icp_algorithm::pattern_adjacent_find(first, last, pred, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec), /*first_semantic*/ false); + using namespace pstl::internal; + return pattern_adjacent_find(first, last, pred, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec), /*first_semantic*/ false); } // [alg.count] @@ -161,35 +162,37 @@ return __icp_algorithm::pattern_adjacent_find(first, last, pred, // so that we do not have to include . template -__icp_algorithm::enable_if_execution_policy::difference_type> +pstl::internal::enable_if_execution_policy::difference_type> count(ExecutionPolicy&& exec, InputIterator first, InputIterator last, const T& value) { typedef typename iterator_traits::value_type value_type; - - return __icp_algorithm::pattern_count(first, last, [&value](value_type x) {return value==x;}, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + return pattern_count(first, last, [&value](value_type x) {return value==x;}, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy::difference_type> +pstl::internal::enable_if_execution_policy::difference_type> count_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, Predicate pred) { - return __icp_algorithm::pattern_count(first, last, pred, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + return pattern_count(first, last, pred, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } // [alg.search] template 
-__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy search(ExecutionPolicy&& exec, ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last, BinaryPredicate pred) { - return __icp_algorithm::pattern_search(first, last, s_first, s_last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_search(first, last, s_first, s_last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy search(ExecutionPolicy&& exec, ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last) { typedef typename iterator_traits::value_type value_type; @@ -197,15 +200,16 @@ search(ExecutionPolicy&& exec, ForwardIterator1 first, ForwardIterator1 last, Fo } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy search_n(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Size count, const T& value, BinaryPredicate pred) { - return __icp_algorithm::pattern_search_n(first, last, count, value, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_search_n(first, last, count, value, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy search_n(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Size count, const T& value) { return search_n(exec, first, last, count, value, std::equal_to::value_type>()); } @@ -213,286 +217,311 @@ search_n(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Si // [alg.copy] template 
-__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result) { - return __icp_algorithm::pattern_copy( - first, last, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + const auto is_vector = is_vectorization_preferred(exec); + + return pattern_walk2_brick(first, last, result, [is_vector](InputIterator begin, InputIterator end, OutputIterator res){ + return brick_copy(begin, end, res, is_vector); + }, is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy copy_n(ExecutionPolicy&& exec, InputIterator first, Size n, OutputIterator result) { - return __icp_algorithm::pattern_copy_n( - first, n, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + const auto is_vector = is_vectorization_preferred(exec); + + return pattern_walk2_brick_n(first, n, result, [is_vector](InputIterator begin, Size sz, OutputIterator res){ + return brick_copy_n(begin, sz, res, is_vector); + }, is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy copy_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, Predicate pred) { - return __icp_algorithm::pattern_copy_if( + using namespace pstl::internal; + return pattern_copy_if( first, last, result, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.swap] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy swap_ranges(ExecutionPolicy&& exec, 
ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2) { - return __icp_algorithm::pattern_swap_ranges(first1, last1, first2, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_swap_ranges(first1, last1, first2, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.transform] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform( ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, UnaryOperation op ) { typedef typename iterator_traits::value_type input_type; typedef typename iterator_traits::value_type output_type; - return __icp_algorithm::pattern_walk2(first, last, result, - [op]( input_type x, output_type& y ) mutable {y = op(x);}, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_walk2(first, last, result, + [op](input_type x, output_type& y ) mutable { y = op(x);}, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform( ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, OutputIterator result, BinaryOperation op ) { typedef typename iterator_traits::value_type input1_type; typedef typename iterator_traits::value_type input2_type; typedef typename iterator_traits::value_type output_type; - return __icp_algorithm::pattern_walk3(first1, last1, first2, result, [op]( input1_type x, input2_type y, output_type& z ) mutable {z = op(x,y);}, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_walk3(first1, last1, first2, result, [op]( 
input1_type x, input2_type y, output_type& z ) mutable {z = op(x,y);}, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.replace] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy replace_if(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, UnaryPredicate pred, const T& new_value) { - __icp_algorithm::pattern_replace_if(first, last, pred, new_value, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + pattern_replace_if(first, last, pred, new_value, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy replace(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& old_value, const T& new_value) { - replace_if(exec, first, last, __icp_algorithm::equal_value(old_value), new_value); + replace_if(exec, first, last, pstl::internal::equal_value(old_value), new_value); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy replace_copy_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, UnaryPredicate pred, const T& new_value) { typedef typename iterator_traits::value_type input_type; typedef typename iterator_traits::value_type output_type; - return __icp_algorithm::pattern_walk2( + using namespace pstl::internal; + return pattern_walk2( first, last, result, [pred, &new_value](input_type x, output_type& y) mutable { y = pred(x) ? 
new_value : x; }, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy replace_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, const T& old_value, const T& new_value) { - return replace_copy_if(exec, first, last, result, __icp_algorithm::equal_value(old_value), new_value); + return replace_copy_if(exec, first, last, result, pstl::internal::equal_value(old_value), new_value); } // [alg.fill] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy fill( ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& value ) { - __icp_algorithm::pattern_fill(first, last, value, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + pattern_fill(first, last, value, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } template< class ExecutionPolicy, class OutputIterator, class Size, class T> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy fill_n( ExecutionPolicy&& exec, OutputIterator first, Size count, const T& value ) { if(count <= 0) return first; - return __icp_algorithm::pattern_fill_n(first, count, value, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + return pattern_fill_n(first, count, value, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } // [alg.generate] template< class ExecutionPolicy, class ForwardIterator, class Generator> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy generate( ExecutionPolicy&& exec, 
ForwardIterator first, ForwardIterator last, Generator g ) { - __icp_algorithm::pattern_generate(first, last, g, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + pattern_generate(first, last, g, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } template< class ExecutionPolicy, class OutputIterator, class Size, class Generator> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy generate_n( ExecutionPolicy&& exec, OutputIterator first, Size count, Generator g ) { if(count <= 0) return first; - return __icp_algorithm::pattern_generate_n(first, count, g, - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec)); + using namespace pstl::internal; + return pattern_generate_n(first, count, g, + is_parallelization_preferred(exec), + is_vectorization_preferred(exec)); } // [alg.remove] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy remove_copy_if(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, Predicate pred) { - return copy_if( exec, first, last, result, __icp_algorithm::not_pred(pred)); + return copy_if( exec, first, last, result, pstl::internal::not_pred(pred)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy remove_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, const T& value) { - return copy_if( exec, first, last, result, __icp_algorithm::not_equal_value(value)); + return copy_if( exec, first, last, result, pstl::internal::not_equal_value(value)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy remove_if(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, UnaryPredicate pred) { - return 
__icp_algorithm::pattern_remove_if(first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_remove_if(first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy remove(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& value) { - return remove_if(exec, first, last, __icp_algorithm::equal_value(value)); + return remove_if(exec, first, last, pstl::internal::equal_value(value)); } // [alg.unique] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy unique(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, BinaryPredicate pred) { - return __icp_algorithm::pattern_unique(first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_unique(first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy unique(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - return unique(exec, first, last, __icp_algorithm::pstl_equal()); + return unique(exec, first, last, pstl::internal::pstl_equal()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy unique_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, BinaryPredicate pred) { - return __icp_algorithm::pattern_unique_copy(first, last, result, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_unique_copy(first, last, 
result, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy unique_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result) { - return unique_copy( exec, first, last, result, __icp_algorithm::pstl_equal() ); + return unique_copy( exec, first, last, result, pstl::internal::pstl_equal() ); } // [alg.reverse] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy reverse(ExecutionPolicy&& exec, BidirectionalIterator first, BidirectionalIterator last) { - __icp_algorithm::pattern_reverse(first, last, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + pattern_reverse(first, last, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy reverse_copy(ExecutionPolicy&& exec, BidirectionalIterator first, BidirectionalIterator last, OutputIterator d_first) { - return __icp_algorithm::pattern_reverse_copy(first, last, d_first, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_reverse_copy(first, last, d_first, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.rotate] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy rotate(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator middle, ForwardIterator last) { - return __icp_algorithm::pattern_rotate(first, middle, last, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_rotate(first, middle, last, + 
is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy rotate_copy(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator middle, ForwardIterator last, OutputIterator result) { - return __icp_algorithm::pattern_rotate_copy(first, middle, last, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_rotate_copy(first, middle, last, result, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.partitions] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_partitioned(ExecutionPolicy&& exec, InputIterator first, InputIterator last, UnaryPredicate pred) { - return __icp_algorithm::pattern_is_partitioned(first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_is_partitioned(first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy partition(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, UnaryPredicate pred) { - return __icp_algorithm::pattern_partition(first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_partition(first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy stable_partition(ExecutionPolicy&& exec, BidirectionalIterator first, BidirectionalIterator last, UnaryPredicate pred) { - return 
__icp_algorithm::pattern_stable_partition(first, last, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_stable_partition(first, last, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> partition_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator1 out_true, OutputIterator2 out_false, UnaryPredicate pred) { - return __icp_algorithm::pattern_partition_copy(first, last, out_true, out_false, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_partition_copy(first, last, out_true, out_false, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [alg.sort] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) { typedef typename iterator_traits::value_type input_type; - - return __icp_algorithm::pattern_sort(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec), + using namespace pstl::internal; + return pattern_sort(first, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec), typename std::is_move_constructible::type()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last) { typedef typename iterator_traits::value_type input_type; sort(exec, first, last, std::less()); @@ -501,17 +530,17 @@ sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator la // 
[stable.sort] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy stable_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) { typedef typename iterator_traits::value_type input_type; - - return __icp_algorithm::pattern_stable_sort(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_stable_sort(first, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy stable_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last) { typedef typename iterator_traits::value_type input_type; stable_sort(exec, first, last, std::less()); @@ -520,28 +549,29 @@ stable_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIter // [mismatch] template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class BinaryPredicate > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> mismatch(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, BinaryPredicate pred) { - return __icp_algorithm::pattern_mismatch(first1, last1, first2, last2, pred, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_mismatch(first1, last1, first2, last2, pred, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class BinaryPredicate > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> mismatch(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, 
InputIterator2 first2, BinaryPredicate pred) { return mismatch(exec, first1, last1, first2, std::next(first2, std::distance(first1, last1)), pred); } template< class ExecutionPolicy, class InputIterator1, class InputIterator2 > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> mismatch(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) { typedef typename iterator_traits::value_type value_type; return mismatch(exec, first1, last1, first2, last2, std::equal_to()); } template< class ExecutionPolicy, class InputIterator1, class InputIterator2 > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> mismatch(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) { return mismatch(exec, first1, last1, first2, std::next(first2, std::distance(first1, last1))); } @@ -549,22 +579,23 @@ mismatch(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, In // [alg.equal] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy equal(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate p) { - return __icp_algorithm::pattern_equal(first1, last1, first2, p, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec) + using namespace pstl::internal; + return pattern_equal(first1, last1, first2, p, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec) ); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy equal(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2) { - return equal(exec, first1, last1, first2, __icp_algorithm::pstl_equal()); + return equal(exec, first1, last1, first2, pstl::internal::pstl_equal()); } template 
-__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy equal(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, BinaryPredicate p) { if ( std::distance(first1, last1) == std::distance(first2, last2) ) return std::equal(first1, last1, first2, p); @@ -573,35 +604,36 @@ equal(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, Input } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy equal(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) { - if ( std::distance(first1, last1) == std::distance(first2, last2) ) - return equal(first1, last1, first2, __icp_algorithm::pstl_equal()); - else - return false; + return equal(first1, last1, first2, pstl::internal::pstl_equal()); } // [alg.move] template< class ExecutionPolicy, class InputIterator, class OutputIterator > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy move(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator d_first) { - return __icp_algorithm::pattern_move(first, last, d_first, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + const auto is_vector = is_vectorization_preferred(exec); + + return pattern_walk2_brick(first, last, d_first, [is_vector](InputIterator begin, InputIterator end, OutputIterator res) { + return brick_move(begin, end, res, is_vector); + }, is_parallelization_preferred(exec)); } // [partial.sort] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy partial_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last, Compare comp) { - __icp_algorithm::pattern_partial_sort(first, middle, last, comp, - 
__icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + pattern_partial_sort(first, middle, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy partial_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator middle, RandomAccessIterator last) { typedef typename iterator_traits::value_type input_type; partial_sort(exec, first, middle, last, std::less()); @@ -610,15 +642,16 @@ partial_sort(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIte // [partial.sort.copy] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy partial_sort_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, RandomAccessIterator d_first, RandomAccessIterator d_last, Compare comp) { - return __icp_algorithm::pattern_partial_sort_copy(first, last, d_first, d_last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_partial_sort_copy(first, last, d_first, d_last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy partial_sort_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, RandomAccessIterator d_first, RandomAccessIterator d_last) { typedef typename iterator_traits::value_type input_type; return partial_sort_copy(exec, first, last, d_first, d_last, std::less()); @@ -626,31 +659,33 @@ partial_sort_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator las // [is.sorted] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_sorted_until(ExecutionPolicy&& exec, 
ForwardIterator first, ForwardIterator last, Compare comp) { - const ForwardIterator res = __icp_algorithm::pattern_adjacent_find(first, last, __icp_algorithm::reorder_pred(comp), - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec), /*first_semantic*/ false); + using namespace pstl::internal; + const ForwardIterator res = pattern_adjacent_find(first, last, pstl::internal::reorder_pred(comp), + is_parallelization_preferred(exec), + is_vectorization_preferred(exec), /*first_semantic*/ false); return res==last ? last : std::next(res); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_sorted_until(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { typedef typename iterator_traits::value_type input_type; return is_sorted_until(exec, first, last, std::less()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_sorted(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Compare comp) { - return __icp_algorithm::pattern_adjacent_find(first, last, __icp_algorithm::reorder_pred(comp), - __icp_algorithm::is_parallelization_preferred(exec), - __icp_algorithm::is_vectorization_preferred(exec), /*or_semantic*/ true)==last; + using namespace pstl::internal; + return pattern_adjacent_find(first, last, reorder_pred(comp), + is_parallelization_preferred(exec), + is_vectorization_preferred(exec), /*or_semantic*/ true)==last; } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_sorted(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { typedef typename iterator_traits::value_type input_type; return is_sorted(exec, first, last, std::less()); @@ -659,15 +694,16 @@ is_sorted(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { // [alg.nth.element] template -__icp_algorithm::enable_if_execution_policy 
+pstl::internal::enable_if_execution_policy nth_element(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last, Compare comp) { - __icp_algorithm::pattern_nth_element(first, nth, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + pattern_nth_element(first, nth, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy nth_element(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator nth, RandomAccessIterator last) { typedef typename iterator_traits::value_type input_type; nth_element(exec, first, nth, last, std::less()); @@ -675,47 +711,50 @@ nth_element(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIter // [alg.merge] template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class OutputIterator, class Compare> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy merge(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator d_first, Compare comp) { - return __icp_algorithm::pattern_merge(first1, last1, first2, last2, d_first, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_merge(first1, last1, first2, last2, d_first, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class OutputIterator> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy merge(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, 
OutputIterator d_first) { typedef typename iterator_traits::value_type value_type; return merge(exec, first1, last1, first2, last2, d_first, std::less()); } template< class ExecutionPolicy, class BidirectionalIterator, class Compare> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy inplace_merge(ExecutionPolicy&& exec, BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last, Compare comp) { - __icp_algorithm::pattern_inplace_merge(first, middle, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + pattern_inplace_merge(first, middle, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class BidirectionalIterator> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy inplace_merge(ExecutionPolicy&& exec, BidirectionalIterator first, BidirectionalIterator middle, BidirectionalIterator last) { - typedef typename std::iterator_traits::value_type input_type; + typedef typename iterator_traits::value_type input_type; inplace_merge(exec, first, middle, last, std::less()); } // [includes] template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class Compare> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy includes(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Compare comp) { - return __icp_algorithm::pattern_includes(first1, last1, first2, last2, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_includes(first1, last1, first2, last2, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class 
InputIterator1, class InputIterator2> -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy includes(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) { typedef typename iterator_traits::value_type value_type; return includes(exec, first1, last1, first2, last2, std::less()); @@ -724,15 +763,16 @@ includes(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, In // [set.union] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_union(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp) { - return __icp_algorithm::pattern_set_union(first1, last1, first2, last2, result, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_set_union(first1, last1, first2, last2, result, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_union(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) { typedef typename iterator_traits::value_type value_type; @@ -742,15 +782,16 @@ set_union(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, I // [set.intersection] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_intersection(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp) { - return __icp_algorithm::pattern_set_intersection(first1, last1, first2, last2, result, comp, - __icp_algorithm::is_vectorization_preferred(exec), - 
__icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_set_intersection(first1, last1, first2, last2, result, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_intersection(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) { typedef typename iterator_traits::value_type value_type; return set_intersection(exec, first1, last1, first2, last2, result, std::less()); @@ -759,15 +800,16 @@ set_intersection(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 l // [set.difference] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp) { - return __icp_algorithm::pattern_set_difference(first1, last1, first2, last2, result, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_set_difference(first1, last1, first2, last2, result, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) { typedef typename iterator_traits::value_type value_type; return set_difference(exec, first1, last1, first2, last2, result, std::less()); @@ -776,15 +818,16 @@ set_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 las // [set.symmetric.difference] template -__icp_algorithm::enable_if_execution_policy 
+pstl::internal::enable_if_execution_policy set_symmetric_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result, Compare comp) { - return __icp_algorithm::pattern_set_symmetric_difference(first1, last1, first2, last2, result, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_set_symmetric_difference(first1, last1, first2, last2, result, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy set_symmetric_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, OutputIterator result) { typedef typename iterator_traits::value_type value_type; return set_symmetric_difference(exec, first1, last1, first2, last2, result, std::less()); @@ -792,75 +835,79 @@ set_symmetric_difference(ExecutionPolicy&& exec, InputIterator1 first1, InputIte // [is.heap] template< class ExecutionPolicy, class RandomAccessIterator, class Compare > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_heap_until(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) { - return __icp_algorithm::pattern_is_heap_until(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_is_heap_until(first, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class RandomAccessIterator > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_heap_until(ExecutionPolicy&& exec, RandomAccessIterator first, 
RandomAccessIterator last) { - typedef typename std::iterator_traits::value_type input_type; + typedef typename iterator_traits::value_type input_type; return is_heap_until(exec, first, last, std::less()); } template< class ExecutionPolicy, class RandomAccessIterator, class Compare > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_heap(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last, Compare comp) { return is_heap_until(exec, first, last, comp) == last; } template< class ExecutionPolicy, class RandomAccessIterator > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy is_heap(ExecutionPolicy&& exec, RandomAccessIterator first, RandomAccessIterator last) { - typedef typename std::iterator_traits::value_type input_type; + typedef typename iterator_traits::value_type input_type; return is_heap(exec, first, last, std::less()); } // [alg.min.max] template< class ExecutionPolicy, class ForwardIterator, class Compare > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy min_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Compare comp) { - return __icp_algorithm::pattern_min_element(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_min_element(first, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class ForwardIterator > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy min_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - typedef typename std::iterator_traits::value_type input_type; + typedef typename iterator_traits::value_type input_type; return min_element(exec, first, last, std::less()); } template< class ExecutionPolicy, class 
ForwardIterator, class Compare > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy max_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Compare comp) { - return __icp_algorithm::pattern_max_element(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_min_element(first, last, pstl::internal::reorder_pred(comp), + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class ForwardIterator > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy max_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - typedef typename std::iterator_traits::value_type input_type; - return max_element(exec, first, last, std::less()); + typedef typename iterator_traits::value_type input_type; + return min_element(exec, first, last, pstl::internal::reorder_pred >(std::less())); } template< class ExecutionPolicy, class ForwardIterator, class Compare > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> minmax_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, Compare comp) { - return __icp_algorithm::pattern_minmax_element(first, last, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_minmax_element(first, last, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class ForwardIterator > -__icp_algorithm::enable_if_execution_policy> +pstl::internal::enable_if_execution_policy> minmax_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { typedef typename iterator_traits::value_type value_type; return minmax_element(exec, 
first, last, std::less()); @@ -869,15 +916,16 @@ minmax_element(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator la // [alg.lex.comparison] template< class ExecutionPolicy, class InputIterator1, class InputIterator2, class Compare > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy lexicographical_compare(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Compare comp) { - return __icp_algorithm::pattern_lexicographical_compare(first1, last1, first2, last2, comp, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_lexicographical_compare(first1, last1, first2, last2, comp, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template< class ExecutionPolicy, class InputIterator1, class InputIterator2 > -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy lexicographical_compare(ExecutionPolicy&& policy, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2) { typedef typename iterator_traits::value_type value_type; return lexicographical_compare(policy, first1, last1, first2, last2, std::less()); diff --git a/include/pstl/execution b/include/pstl/execution index 300ff2d25c5..1ee69551bc9 100644 --- a/include/pstl/execution +++ b/include/pstl/execution @@ -22,7 +22,7 @@ #define __PSTL_execution_policy_H #include -#include "_internal/pstl_config.h" +#include "internal/pstl_config.h" namespace pstl { namespace execution { diff --git a/include/pstl/_internal/algorithm_impl.h b/include/pstl/internal/algorithm_impl.h similarity index 73% rename from include/pstl/_internal/algorithm_impl.h rename to include/pstl/internal/algorithm_impl.h index 4e4a07708a5..b24191f5344 100644 --- a/include/pstl/_internal/algorithm_impl.h +++ 
b/include/pstl/internal/algorithm_impl.h @@ -28,21 +28,16 @@ #include #include "execution_policy_impl.h" +#include "simd_impl.h" -namespace __icp_algorithm { -//------------------------------------------------------------------------ -// forward -//------------------------------------------------------------------------ -template -void parallel_for(Index first, Index last, F f); -template -Index parallel_first(Index first, Index last, Brick f); -template -bool parallel_or(Index first, Index last, Brick f); -template -void parallel_strict_scan(Index n, T initial, R reduce, C combine, S scan, A apex); -template -void parallel_stable_sort(RandomAccessIterator xs, RandomAccessIterator xe, Compare comp, LeafSort leaf_sort); +#if __PSTL_USE_TBB + #include "parallel_impl_tbb.h" +#else + __PSTL_PRAGMA_MESSAGE("Backend was not specified"); +#endif + +namespace pstl { +namespace internal { //------------------------------------------------------------------------ // any_of @@ -66,15 +61,32 @@ bool pattern_any_of( InputIterator first, InputIterator last, Pred pred, IsVecto template bool pattern_any_of( InputIterator first, InputIterator last, Pred pred, IsVector is_vector, /*parallel=*/std::true_type ) { - return parallel_or( first, last, - [pred, is_vector](InputIterator i, InputIterator j) {return brick_any_of(i, j, pred, is_vector);} ); + return except_handler([=]() { + return par_backend::parallel_or( first, last, + [pred, is_vector](InputIterator i, InputIterator j) {return brick_any_of(i, j, pred, is_vector);} ); + }); +} + + +// [alg.foreach] +// for_each_n with no policy + +template +InputIterator for_each_n_serial(InputIterator first, Size n, Function f) { + for(; n > 0; ++first, --n) + f(first); + return first; } +template +InputIterator for_each_n(InputIterator first, Size n, Function f) { + return for_each_n_serial(first, n, [&f](InputIterator it) { f(*it); }); +} //------------------------------------------------------------------------ // walk1 (pseudo) // -// 
walk1 evaluates f(x) for each x drawn from [first,last) +// walk1 evaluates f(x) for each dereferenced value x drawn from [first,last) //------------------------------------------------------------------------ template void brick_walk1( Iterator first, Iterator last, Function f, /*vector=*/std::false_type ) noexcept { @@ -82,12 +94,6 @@ void brick_walk1( Iterator first, Iterator last, Function f, /*vector=*/std::fal f(*first); } -template -void brick_walk1( T* __restrict first, T* __restrict last, Function f, /*vector=*/std::false_type ) noexcept { - for(; first!=last; ++first ) - f(*first); -} - template void brick_walk1( Iterator first, Iterator last, Function f, /*vector=*/std::true_type ) noexcept { simd_walk_1(first, last-first, f); @@ -101,41 +107,69 @@ void pattern_walk1( Iterator first, Iterator last, Function f, IsVector is_vecto template void pattern_walk1( Iterator first, Iterator last, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { - parallel_for( first, last, [f,is_vector](Iterator i, Iterator j) { - brick_walk1(i,j,f,is_vector); + except_handler([=]() { + par_backend::parallel_for( first, last, [f,is_vector](Iterator i, Iterator j) { + brick_walk1(i,j,f,is_vector); + }); }); } +template +void pattern_walk_brick( Iterator first, Iterator last, Brick brick, /*parallel=*/std::false_type ) noexcept { + brick(first, last); +} -// [alg.foreach] -// for_each_n with no policy -template -InputIterator for_each_n(InputIterator first, Size n, Function f) { - for( ; n > 0; ++first, --n ) - f(*first); - return first; +template +void pattern_walk_brick( Iterator first, Iterator last, Brick brick, /*parallel=*/std::true_type ) { + except_handler([=]() { + par_backend::parallel_for( first, last, [brick](Iterator i, Iterator j) { + brick(i,j); + }); + }); +} + + +//------------------------------------------------------------------------ +// it_walk1 (pseudo) +// +// it_walk1 evaluates f(it) for each iterator it drawn from [first,last) 
+//------------------------------------------------------------------------ +template +void brick_it_walk1( Iterator first, Iterator last, Function f, /*vector=*/std::false_type ) noexcept { + for(; first!=last; ++first ) + f(first); +} + +template +void brick_it_walk1( Iterator first, Iterator last, Function f, /*vector=*/std::true_type ) noexcept { + simd_it_walk_1(first, last-first, f); +} + +template +void pattern_it_walk1( Iterator first, Iterator last, Function f, IsVector is_vector, /*parallel=*/std::false_type ) noexcept { + brick_it_walk1( first, last, f, is_vector ); +} + +template +void pattern_it_walk1( Iterator first, Iterator last, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { + except_handler([=]() { + par_backend::parallel_for( first, last, [f,is_vector](Iterator i, Iterator j) { + brick_it_walk1(i,j,f,is_vector); + }); + }); } //------------------------------------------------------------------------ // walk1_n //------------------------------------------------------------------------ template -InputIterator brick_walk1_n(InputIterator first, Size n, Function f, - /*IsVectorTag=*/std::false_type ) { +InputIterator brick_walk1_n(InputIterator first, Size n, Function f, /*IsVectorTag=*/std::false_type ) { return for_each_n( first, n, f ); // calling serial version } -template -RandomAccessIterator brick_walk1_n( RandomAccessIterator first, Size n, Function f, - /*vectorTag=*/std::true_type ) noexcept(noexcept(f(first[0]))) { - RandomAccessIterator last = first + n; - RandomAccessIterator begin = first < last ? first : last; - RandomAccessIterator end = first < last ? 
last : first; - Size positive_n = end - begin; -__PSTL_PRAGMA_SIMD - for( Size i = 0; i < positive_n; ++i ) - f( begin[i] ); - return end; +template +RandomAccessIterator brick_walk1_n( RandomAccessIterator first, DifferenceType n, Function f, /*vectorTag=*/std::true_type ) noexcept { + return simd_walk_1(first, n, f); } template @@ -145,24 +179,57 @@ InputIterator pattern_walk1_n( InputIterator first, Size n, Function f, IsVector template RandomAccessIterator pattern_walk1_n( RandomAccessIterator first, Size n, Function f, IsVector is_vector, /*is_parallel=*/std::true_type ) { - RandomAccessIterator last = first + n; - parallel_for( first, last, - [ &f, is_vector ]( RandomAccessIterator first, RandomAccessIterator last ) { - brick_walk1_n( first, last - first, f, is_vector ); - } ); - return last; + pattern_walk1(first, first + n, f, is_vector, std::true_type()); + return first + n; +} + +template +InputIterator pattern_walk_brick_n( InputIterator first, Size n, Brick brick, /*is_parallel=*/std::false_type ) noexcept { + return brick(first, n); +} + +template +RandomAccessIterator pattern_walk_brick_n( RandomAccessIterator first, Size n, Brick brick, /*is_parallel=*/std::true_type ) { + return except_handler([=]() { + par_backend::parallel_for(first, first + n, [brick](RandomAccessIterator i, RandomAccessIterator j) { + brick(i, j-i); + }); + return first + n; + }); +} + + + +template +InputIterator brick_it_walk1_n(InputIterator first, Size n, Function f, /*IsVectorTag=*/std::false_type ) { + return for_each_n_serial(first, n, f); // calling serial version +} + +template +RandomAccessIterator brick_it_walk1_n( RandomAccessIterator first, DifferenceType n, Function f, /*vectorTag=*/std::true_type ) noexcept { + return simd_it_walk_1(first, n, f); +} + +template +InputIterator pattern_it_walk1_n( InputIterator first, Size n, Function f, IsVector is_vector, /*is_parallel=*/std::false_type ) noexcept { + return brick_it_walk1_n(first, n, f, is_vector); } +template 
+RandomAccessIterator pattern_it_walk1_n( RandomAccessIterator first, Size n, Function f, IsVector is_vector, /*is_parallel=*/std::true_type ) { + pattern_it_walk1(first, first + n, f, is_vector, std::true_type()); + return first + n; +} //------------------------------------------------------------------------ // walk2 (pseudo) // -// walk2 evaluates f(x,y) for (x,y) drawn from [first1,last1) and [first2,...) +// walk2 evaluates f(x,y) for dereferenced values (x,y) drawn from [first1,last1) and [first2,...) //------------------------------------------------------------------------ template Iterator2 brick_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, /*vector=*/std::false_type ) noexcept { for(; first1!=last1; ++first1, ++first2 ) - f(*first1,*first2); + f(*first1, *first2); return first2; } @@ -171,6 +238,19 @@ Iterator2 brick_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Func return simd_walk_2(first1, last1-first1, first2, f); } +template +Iterator2 brick_walk2_n( Iterator1 first1, Size n, Iterator2 first2, Function f, /*vector=*/std::false_type ) noexcept { + for(; n > 0; --n, ++first1, ++first2 ) + f(*first1, *first2); + return first2; +} + +template +Iterator2 brick_walk2_n(Iterator1 first1, Size n, Iterator2 first2, Function f, /*vector=*/std::true_type) noexcept { + return simd_walk_2(first1, n, first2, f); +} + + template Iterator2 pattern_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::false_type ) noexcept { @@ -179,16 +259,121 @@ Iterator2 pattern_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Fu template Iterator2 pattern_walk2(Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { - parallel_for( - first1, last1, - [f,first1,first2,is_vector](Iterator1 i, Iterator1 j) { - brick_walk2(i,j,first2+(i-first1),f,is_vector); - } - ); - return first2+(last1-first1); + return
except_handler([=]() { + par_backend::parallel_for( + first1, last1, + [f,first1,first2,is_vector](Iterator1 i, Iterator1 j) { + brick_walk2(i,j,first2+(i-first1),f,is_vector); + } + ); + return first2+(last1-first1); + }); +} + +template +Iterator2 pattern_walk2_n( Iterator1 first1, Size n, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::false_type ) noexcept { + return brick_walk2_n(first1, n, first2, f, is_vector); +} + +template +Iterator2 pattern_walk2_n(Iterator1 first1, Size n, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { + return pattern_walk2(first1, first1 + n, first2, f, is_vector, std::true_type()); +} + +template +Iterator2 pattern_walk2_brick( Iterator1 first1, Iterator1 last1, Iterator2 first2, Brick brick, /*parallel=*/std::false_type ) noexcept { + return brick(first1,last1,first2); +} + +template +Iterator2 pattern_walk2_brick(Iterator1 first1, Iterator1 last1, Iterator2 first2, Brick brick, /*parallel=*/std::true_type ) { + return except_handler([=]() { + par_backend::parallel_for( + first1, last1, + [first1,first2, brick](Iterator1 i, Iterator1 j) { + brick(i,j,first2+(i-first1)); + } + ); + return first2+(last1-first1); + }); +} + +template +Iterator2 pattern_walk2_brick_n(Iterator1 first1, Size n, Iterator2 first2, Brick brick, /*parallel=*/std::true_type ) { + return except_handler([=]() { + par_backend::parallel_for( + first1, first1+n, + [first1,first2, brick](Iterator1 i, Iterator1 j) { + brick(i, j-i, first2+(i-first1)); + } + ); + return first2 + n; + }); +} + +template +Iterator2 pattern_walk2_brick_n( Iterator1 first1, Size n, Iterator2 first2, Brick brick, /*parallel=*/std::false_type ) noexcept { + return brick(first1, n, first2); } +//------------------------------------------------------------------------ +// it_walk2 (pseudo) +// +// it_walk2 evaluates f(it1, it2) for iterators (it1, it2) drawn from [first1,last1) and [first2,...) 
+//------------------------------------------------------------------------ +template +Iterator2 brick_it_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, /*vector=*/std::false_type ) noexcept { + for(; first1!=last1; ++first1, ++first2 ) + f(first1, first2); + return first2; +} + +template +Iterator2 brick_it_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, /*vector=*/std::true_type) noexcept { + return simd_it_walk_2(first1, last1-first1, first2, f); +} + +template +Iterator2 brick_it_walk2_n( Iterator1 first1, Size n, Iterator2 first2, Function f, /*vector=*/std::false_type ) noexcept { + for(; n > 0; --n, ++first1, ++first2 ) + f(first1, first2); + return first2; +} + +template +Iterator2 brick_it_walk2_n(Iterator1 first1, Size n, Iterator2 first2, Function f, /*vector=*/std::true_type) noexcept { + return simd_it_walk_2(first1, n, first2, f); +} + +template +Iterator2 pattern_it_walk2( Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::false_type ) noexcept { + return brick_it_walk2(first1,last1,first2,f,is_vector); +} + +template +Iterator2 pattern_it_walk2(Iterator1 first1, Iterator1 last1, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { + return except_handler([=]() { + par_backend::parallel_for( + first1, last1, + [f,first1,first2,is_vector](Iterator1 i, Iterator1 j) { + brick_it_walk2(i,j,first2+(i-first1),f,is_vector); + } + ); + return first2+(last1-first1); + }); +} + +template +Iterator2 pattern_it_walk2_n( Iterator1 first1, Size n, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::false_type ) noexcept { + return brick_it_walk2_n(first1, n, first2, f, is_vector); +} + +template +Iterator2 pattern_it_walk2_n(Iterator1 first1, Size n, Iterator2 first2, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { + return pattern_it_walk2(first1, first1 + n, first2, f, is_vector, std::true_type()); +} + 
//------------------------------------------------------------------------ // walk3 (pseudo) // @@ -214,13 +399,14 @@ Iterator3 pattern_walk3( Iterator1 first1, Iterator1 last1, Iterator2 first2, It template Iterator3 pattern_walk3(Iterator1 first1, Iterator1 last1, Iterator2 first2, Iterator3 first3, Function f, IsVector is_vector, /*parallel=*/std::true_type ) { - parallel_for( - first1, last1, - [f, first1, first2, first3, is_vector](Iterator1 i, Iterator1 j) { - brick_walk3(i, j, first2+(i-first1), first3+(i-first1), f, is_vector); - } - ); - return first3+(last1-first1); + return except_handler([=]() { + par_backend::parallel_for( + first1, last1, + [f, first1, first2, first3, is_vector](Iterator1 i, Iterator1 j) { + brick_walk3(i, j, first2+(i-first1), first3+(i-first1), f, is_vector); + }); + return first3+(last1-first1); + }); } @@ -244,39 +430,19 @@ InputIterator pattern_find_if( InputIterator first, InputIterator last, Predicat template InputIterator pattern_find_if( InputIterator first, InputIterator last, Predicate pred, IsVector is_vector, /*is_parallel=*/std::true_type ) { - return parallel_first( first, last, [pred,is_vector](InputIterator i, InputIterator j) { - return brick_find_if(i,j,pred,is_vector); - }); + return except_handler([=]() { + return par_backend::parallel_first( first, last, [pred,is_vector](InputIterator i, InputIterator j) { + return brick_find_if(i,j,pred,is_vector); + }); + }); } //------------------------------------------------------------------------ // find_end //------------------------------------------------------------------------ -template -ForwardIt1 search_serial(ForwardIt1 first, ForwardIt1 last, ForwardIt2 s_first, ForwardIt2 s_last, BinaryPredicate p, bool b_first) { - if(s_first == s_last) - return last; - - ForwardIt1 result = last; - for(; first != last; ++first) { - auto it1 = first; - auto it2 = s_first; - for(; it2 != s_last && it1 != last; ++it2, ++it1) { - if(!p(*it1, *it2)) - break; - } - if(it2 == s_last) 
{//subsequence was found - result = first; - if(b_first) //first occurrence semantic - break; - } - } - return result; -} - template ForwardIterator1 brick_find_end(ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last, BinaryPredicate pred, /*is_vector=*/std::false_type) noexcept { - return search_serial(first, last, s_first, s_last, pred, false); + return std::find_end(first, last, s_first, s_last, pred); } template @@ -325,7 +491,7 @@ InputIterator pattern_find_first_of(InputIterator first, InputIterator last, For //------------------------------------------------------------------------ template ForwardIterator1 brick_search(ForwardIterator1 first, ForwardIterator1 last, ForwardIterator2 s_first, ForwardIterator2 s_last, BinaryPredicate pred, /*vector=*/std::false_type) noexcept { - return search_serial(first, last, s_first, s_last, pred, true); + return std::search(first, last, s_first, s_last, pred); } template @@ -379,23 +545,10 @@ OutputIterator brick_copy_n(InputIterator first, Size n, OutputIterator result, template OutputIterator brick_copy_n(InputIterator first, Size n, OutputIterator result, /*vector=*/std::true_type) noexcept { - return simd_copy_n(first, n, result); -} - -template -OutputIterator pattern_copy_n(InputIterator first, Size n, OutputIterator result, IsVector is_vector, /*parallel=*/std::false_type) noexcept { - return brick_copy_n(first, n, result, is_vector); -} - -template -OutputIterator pattern_copy_n(InputIterator first, Size n, OutputIterator result, IsVector is_vector, /*parallel=*/std::true_type) { - parallel_for( - Size(0), n, - [first,result,is_vector](Size i, Size j) { - brick_copy_n(first+i, j-i, result+i, is_vector); - } - ); - return result+n; + return simd_copy_move(first, n, result, + [](InputIterator first, OutputIterator result) { + *result = *first; + }); } //------------------------------------------------------------------------ @@ -408,17 +561,26 @@ OutputIterator 
brick_copy(InputIterator first, InputIterator last, OutputIterato template OutputIterator brick_copy(InputIterator first, InputIterator last, OutputIterator result, /*vector=*/std::true_type) noexcept { - return brick_copy_n(first, last - first, result, std::true_type()); + return simd_copy_move(first, last - first, result, + [](InputIterator first, OutputIterator result) { + *result = *first; + }); } -template -OutputIterator pattern_copy(InputIterator first, InputIterator last, OutputIterator result, IsVector is_vector, /*parallel=*/std::false_type) noexcept { - return brick_copy(first, last, result, is_vector); +//------------------------------------------------------------------------ +// move +//------------------------------------------------------------------------ +template +OutputIterator brick_move(InputIterator first, InputIterator last, OutputIterator result, /*vector=*/std::false_type) noexcept { + return std::move(first, last, result); } -template -OutputIterator pattern_copy(InputIterator first, InputIterator last, OutputIterator result, IsVector is_vector, /*parallel=*/std::true_type) { - return pattern_copy_n(first, last - first, result, is_vector, std::true_type()); +template +OutputIterator brick_move(InputIterator first, InputIterator last, OutputIterator result, /*vector=*/std::true_type) noexcept { + return simd_copy_move(first, last - first, result, + [](InputIterator first, OutputIterator result) { + *result = std::move(*first); + }); } //------------------------------------------------------------------------ @@ -440,24 +602,32 @@ OutputIterator brick_copy_if(InputIterator first, InputIterator last, OutputIter // TODO: Try to use transform_reduce for combining brick_copy_if_phase1 on IsVector. 
template -DifferenceType brick_calc_mask_1(InputIterator first, InputIterator last, bool* __restrict mask, UnaryPredicate pred, /*vector=*/std::false_type) noexcept { - DifferenceType count = 0; +std::pair brick_calc_mask_1( + InputIterator first, InputIterator last, bool* __restrict mask, UnaryPredicate pred, /*vector=*/std::false_type) noexcept { + auto count_true = DifferenceType(0); + auto count_false = DifferenceType(0); + auto size = std::distance(first, last); + for (; first != last; ++first, ++mask) { *mask = pred(*first); - count += *mask; + if (*mask) { + ++count_true; + } } - return count; + return std::make_pair(count_true, size - count_true); } template -DifferenceType brick_calc_mask_1(InputIterator first, InputIterator last, bool* __restrict mask, UnaryPredicate pred, /*vector=*/std::true_type) noexcept { - return simd_calc_mask_1(first, last-first, mask, pred); +std::pair brick_calc_mask_1( + InputIterator first, InputIterator last, bool* __restrict mask, UnaryPredicate pred, /*vector=*/std::true_type) noexcept { + auto result = simd_calc_mask_1(first, last - first, mask, pred); + return std::make_pair(result, (last - first) - result); } template -void brick_copy_by_mask(InputIterator first, InputIterator last, OutputIterator result, bool* mask, /*vector=*/std::false_type ) noexcept { - for(;first!=last; ++first, ++mask) { - if( *mask ) { +void brick_copy_by_mask(InputIterator first, InputIterator last, OutputIterator result, bool* mask, /*vector=*/std::false_type) noexcept { + for (; first != last; ++first, ++mask) { + if (*mask) { *result = *first; ++result; } @@ -467,13 +637,39 @@ void brick_copy_by_mask(InputIterator first, InputIterator last, OutputIterator template void brick_copy_by_mask(InputIterator first, InputIterator last, OutputIterator result, bool* __restrict mask, /*vector=*/std::true_type) noexcept { #if (__PSTL_MONOTONIC_PRESENT) - simd_copy_by_mask(first, last-first, result, mask); + simd_copy_by_mask(first, last - first, result, 
mask); #else brick_copy_by_mask(first, last, result, mask, std::false_type()); #endif } +template +void brick_partition_by_mask(InputIterator first, InputIterator last, OutputIterator1 out_true, + OutputIterator2 out_false, bool* mask, /*vector=*/std::false_type) noexcept { + for (; first != last; ++first, ++mask) { + if (*mask) { + *out_true = *first; + ++out_true; + } + else { + *out_false = *first; + ++out_false; + } + } +} + +template +void brick_partition_by_mask(InputIterator first, InputIterator last, OutputIterator1 out_true, + OutputIterator2 out_false, bool* mask, /*vector=*/std::true_type) noexcept { +#if (__PSTL_MONOTONIC_PRESENT) + simd_partition_by_mask(first, last - first, out_true, out_false, mask); +#else + brick_partition_by_mask(first, last, out_true, out_false, mask, std::false_type()); +#endif + +} + template OutputIterator pattern_copy_if(InputIterator first, InputIterator last, OutputIterator result, UnaryPredicate pred, IsVector is_vector, /*parallel=*/std::false_type) noexcept { return brick_copy_if(first, last, result, pred, is_vector); @@ -482,28 +678,30 @@ OutputIterator pattern_copy_if(InputIterator first, InputIterator last, OutputIt template OutputIterator pattern_copy_if(InputIterator first, InputIterator last, OutputIterator result, UnaryPredicate pred, IsVector is_vector, /*parallel=*/std::true_type) { typedef typename std::iterator_traits::difference_type difference_type; - difference_type n = last-first; + const difference_type n = last-first; if( difference_type(1) < n ) { - raw_buffer mask_buf(n*sizeof(bool)); + par_backend::raw_buffer mask_buf(n*sizeof(bool)); if( mask_buf ) { - bool* mask = static_cast(mask_buf.get()); - difference_type m; - parallel_strict_scan( n, difference_type(0), - [=](difference_type i, difference_type len) { // Reduce - return brick_calc_mask_1(first+i, first+(i+len), + return except_handler([n, first, last, result, is_vector, pred, &mask_buf]() { + bool* mask = static_cast(mask_buf.get()); + 
difference_type m; + par_backend::parallel_strict_scan( n, difference_type(0), + [=](difference_type i, difference_type len) { // Reduce + return brick_calc_mask_1(first+i, first+(i+len), mask + i, pred, - is_vector); - }, - std::plus(), // Combine - [=](difference_type i, difference_type len, difference_type initial) { // Scan - brick_copy_by_mask(first+i, first+(i+len), - result+initial, - mask + i, - is_vector); - }, - [&m](difference_type total) {m=total;}); - return result + m; + is_vector).first; + }, + std::plus(), // Combine + [=](difference_type i, difference_type len, difference_type initial) { // Scan + brick_copy_by_mask(first+i, first+(i+len), + result+initial, + mask + i, + is_vector); + }, + [&m](difference_type total) {m=total;}); + return result + m; + }); } } // Out of memory or trivial sequence - use serial algorithm @@ -577,39 +775,41 @@ DifferenceType brick_calc_mask_2(InputIterator first, InputIterator last, bool* template OutputIterator pattern_unique_copy(InputIterator first, InputIterator last, OutputIterator result, BinaryPredicate pred, IsVector is_vector, /*parallel=*/std::true_type) { typedef typename std::iterator_traits::difference_type difference_type; - difference_type n = last-first; + const difference_type n = last-first; if( difference_type(2)(mask_buf.get()); - difference_type m; - parallel_strict_scan( n, difference_type(0), - [=](difference_type i, difference_type len) -> difference_type { // Reduce - difference_type extra = 0; - if( i==0 ) { - // Special boundary case - mask[i] = true; - if( --len==0 ) return 1; - ++i; - ++extra; - } - return brick_calc_mask_2( - first+i, first+(i+len), - mask + i, - pred, - is_vector) + extra; - }, - std::plus(), // Combine - [=](difference_type i, difference_type len, difference_type initial) { // Scan - // Phase 2 is same as for pattern_copy_if - brick_copy_by_mask( - first+i, first+(i+len), - result+initial, - mask + i, - is_vector); - }, - [&m](difference_type total) {m=total;}); - 
return result + m; + return except_handler([n, first, result, pred, is_vector, &mask_buf]() { + bool* mask = static_cast(mask_buf.get()); + difference_type m; + par_backend::parallel_strict_scan( n, difference_type(0), + [=](difference_type i, difference_type len) -> difference_type { // Reduce + difference_type extra = 0; + if( i==0 ) { + // Special boundary case + mask[i] = true; + if( --len==0 ) return 1; + ++i; + ++extra; + } + return brick_calc_mask_2( + first+i, first+(i+len), + mask + i, + pred, + is_vector) + extra; + }, + std::plus(), // Combine + [=](difference_type i, difference_type len, difference_type initial) { // Scan + // Phase 2 is same as for pattern_copy_if + brick_copy_by_mask( + first+i, first+(i+len), + result+initial, + mask + i, + is_vector); + }, + [&m](difference_type total) {m=total;}); + return result + m; + }); } } // Out of memory or trivial sequence - use serial algorithm @@ -626,7 +826,7 @@ ForwardIterator2 brick_swap_ranges(ForwardIterator1 first1, ForwardIterator1 las } template -ForwardIterator2 brick_swap_ranges(ForwardIterator1 first1, ForwardIterator2 last1, ForwardIterator2 first2, /*is_vector=*/std::true_type) noexcept { +ForwardIterator2 brick_swap_ranges(ForwardIterator1 first1, ForwardIterator1 last1, ForwardIterator2 first2, /*is_vector=*/std::true_type) noexcept { __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); return std::swap_ranges(first1, last1, first2); } @@ -723,15 +923,13 @@ OutputIterator pattern_reverse_copy(BidirectionalIterator first, BidirectionalIt //------------------------------------------------------------------------ // rotate //------------------------------------------------------------------------ - template ForwardIterator brick_rotate(ForwardIterator first, ForwardIterator middle, ForwardIterator last, /*is_vector=*/std::false_type) noexcept { - #if __PSTL_CPP11_STD_ROTATE_BROKEN - std::rotate(first, middle, last); + std::rotate(first, middle, last); return 
std::next(first, std::distance(middle, last)); #else - return std::rotate(first, middle, last); + return std::rotate(first, middle, last); #endif } @@ -871,8 +1069,11 @@ brick_partition_copy(InputIterator first, InputIterator last, OutputIterator1 ou template std::pair brick_partition_copy(InputIterator first, InputIterator last, OutputIterator1 out_true, OutputIterator2 out_false, UnaryPredicate pred, /*is_vector=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return brick_partition_copy(first, last, out_true, out_false, pred, std::false_type()); +#if (__PSTL_MONOTONIC_PRESENT) + return simd_partition_copy(first, last - first, out_true, out_false, pred); +#else + return std::partition_copy(first, last, out_true, out_false, pred); +#endif } @@ -885,7 +1086,38 @@ pattern_partition_copy(InputIterator first, InputIterator last, OutputIterator1 template std::pair pattern_partition_copy(InputIterator first, InputIterator last, OutputIterator1 out_true, OutputIterator2 out_false, UnaryPredicate pred, IsVector is_vector, /*is_parallelization=*/std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); + typedef typename std::iterator_traits::difference_type difference_type; + typedef std::pair return_type; + const difference_type n = last - first; + if (difference_type(1) < n) { + par_backend::raw_buffer mask_buf(n * sizeof(bool)); + if (mask_buf) { + return except_handler([n, first, last, out_true, out_false, is_vector, pred, &mask_buf]() { + bool* mask = static_cast(mask_buf.get()); + return_type m; + par_backend::parallel_strict_scan(n, std::make_pair(difference_type(0), difference_type(0)), + [=](difference_type i, difference_type len) { // Reduce + return brick_calc_mask_1(first + i, first + (i + len), + mask + i, + pred, + is_vector); + }, + [](const return_type& x, const return_type& y)-> return_type { + return std::make_pair(x.first + y.first, 
x.second + y.second); + }, // Combine + [=](difference_type i, difference_type len, return_type initial) { // Scan + brick_partition_by_mask(first + i, first + (i + len), + out_true + initial.first, + out_false + initial.second, + mask + i, + is_vector); + }, + [&m](return_type total) {m = total; }); + return std::make_pair(out_true + m.first, out_false + m.second); + }); + } + } + // Out of memory or trivial sequence - use serial algorithm return brick_partition_copy(first, last, out_true, out_false, pred, is_vector); } @@ -901,9 +1133,11 @@ void pattern_sort(RandomAccessIterator first, RandomAccessIterator last, Compare template void pattern_sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp, IsVector /*is_vector*/, /*is_parallel=*/std::true_type, /*is_move_constructible=*/std::true_type ) { - parallel_stable_sort(first, last, comp, - [](RandomAccessIterator first, RandomAccessIterator last, Compare comp) { - std::sort(first, last, comp); + except_handler([=]() { + par_backend::parallel_stable_sort(first, last, comp, + [](RandomAccessIterator first, RandomAccessIterator last, Compare comp) { + std::sort(first, last, comp); + }); }); } @@ -918,9 +1152,11 @@ void pattern_stable_sort(RandomAccessIterator first, RandomAccessIterator last, template void pattern_stable_sort(RandomAccessIterator first, RandomAccessIterator last, Compare comp, IsVector /*is_vector*/, /*is_parallel=*/std::true_type) { - parallel_stable_sort(first, last, comp, - [](RandomAccessIterator first, RandomAccessIterator last, Compare comp) { - std::stable_sort(first, last, comp); + except_handler([=]() { + par_backend::parallel_stable_sort(first, last, comp, + [](RandomAccessIterator first, RandomAccessIterator last, Compare comp) { + std::stable_sort(first, last, comp); + }); }); } @@ -987,7 +1223,7 @@ bool brick_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 fir template bool brick_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, 
BinaryPredicate p, /* is_vector = */ std::true_type) noexcept { - return simd_first(first1, last1-first1, first2, __icp_algorithm::not_pred(p)) == last1; + return simd_first(first1, last1-first1, first2, not_pred(p)).first == last1; } template @@ -995,6 +1231,15 @@ bool pattern_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 f return brick_equal(first1, last1, first2, p, is_vector); } +template +bool pattern_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate p, IsVector vec, /*is_parallel=*/std::true_type) { + return except_handler([=]() { + return !par_backend::parallel_or(first1, last1, + [first1, first2, p, vec](InputIterator1 i, InputIterator1 j) {return !brick_equal(i, j, + first2 + (i - first1), p, vec); }); + }); +} + //------------------------------------------------------------------------ // count //------------------------------------------------------------------------ @@ -1016,6 +1261,20 @@ pattern_count(InputIterator first, InputIterator last, Predicate pred, /* is_par return brick_count(first, last, pred, vec); } +template +typename std::iterator_traits::difference_type +pattern_count(InputIterator first, InputIterator last, Predicate pred, /* is_parallel */ std::true_type, IsVector vec) { + typedef typename std::iterator_traits::difference_type size_type; + return except_handler([=]() { + return par_backend::parallel_reduce(first, last, size_type(0), + [pred, vec](InputIterator begin, InputIterator end, size_type value)->size_type { + return value + brick_count(begin, end, pred, vec); + }, + std::plus() + ); + }); +} + //------------------------------------------------------------------------ // adjacent_find //------------------------------------------------------------------------ @@ -1034,6 +1293,42 @@ ForwardIt pattern_adjacent_find(ForwardIt first, ForwardIt last, BinaryPredicate return brick_adjacent_find(first, last, pred, vec, or_semantic); } +template +ForwardIt 
pattern_adjacent_find(ForwardIt first, ForwardIt last, BinaryPredicate pred, /* is_parallel */ std::true_type, IsVector vec, bool or_semantic) { + if (last - first < 2) + return last; + + return except_handler([=]() { + return par_backend::parallel_reduce(first, last, last, + [last, pred, vec, or_semantic](ForwardIt begin, ForwardIt end, ForwardIt value)->ForwardIt { + + // TODO: investigate performance benefits from the use of shared variable for the result, + // checking (compare_and_swap idiom) its value at first. + if (or_semantic && value < last) {//found + par_backend::cancel_execution(); + return value; + } + + if (value > begin) { + // modify end to check the predicate on the boundary values; + // TODO: to use a custom range with boundaries overlapping + // TODO: investigate what if we remove "if" below and run algorithm on range [first, last-1) + // then check the pair [last-1, last) + if (end != last) + ++end; + + //correct the global result iterator if the "brick" returns a local "last" + const ForwardIt res = brick_adjacent_find(begin, end, pred, vec, or_semantic); + if (res < end) + value = res; + } + return value; + }, + [](ForwardIt x, ForwardIt y)->ForwardIt { return x < y ? 
x : y; } //reduce a value + ); + }); +} + //------------------------------------------------------------------------ // nth_element //------------------------------------------------------------------------ @@ -1078,9 +1373,18 @@ void pattern_fill(ForwardIterator first, ForwardIterator last, const T& value, / brick_fill(first, last, value, vec); } +template +ForwardIterator pattern_fill(ForwardIterator first, ForwardIterator last, const T& value, /*is_parallel=*/std::true_type, IsVector vec) { + return except_handler([=]() { + par_backend::parallel_for(first, last, [&value, vec](ForwardIterator begin, ForwardIterator end) { + brick_fill(begin, end, value, vec); }); + return last; + }); +} + template OutputIterator brick_fill_n(OutputIterator first, Size count, const T& value, /* is_vector = */ std::true_type) noexcept { - return simd_fill_n(first, count, value);; + return simd_fill_n(first, count, value); } template @@ -1093,6 +1397,11 @@ OutputIterator pattern_fill_n(OutputIterator first, Size count, const T& value, return brick_fill_n(first, count, value, vec); } +template +OutputIterator pattern_fill_n(OutputIterator first, Size count, const T& value, /*is_parallel=*/std::true_type, IsVector vec) { + return pattern_fill(first, first + count, value, std::true_type(), vec); +} + //------------------------------------------------------------------------ // generate, generate_n //------------------------------------------------------------------------ @@ -1111,6 +1420,15 @@ void pattern_generate(ForwardIterator first, ForwardIterator last, Generator g, brick_generate(first, last, g, vec); } +template +ForwardIterator pattern_generate(ForwardIterator first, ForwardIterator last, Generator g, /*is_parallel=*/std::true_type, IsVector vec) { + return except_handler([=]() { + par_backend::parallel_for(first, last, [g, vec](ForwardIterator begin, ForwardIterator end) { + brick_generate(begin, end, g, vec); }); + return last; + }); +} + template OutputIterator 
brick_generate_n(OutputIterator first, Size count, Generator g, /* is_vector = */ std::true_type) noexcept { return simd_generate_n(first, count, g); @@ -1126,6 +1444,11 @@ OutputIterator pattern_generate_n(OutputIterator first, Size count, Generator g, return brick_generate_n(first, count, g, vec); } +template +OutputIterator pattern_generate_n(OutputIterator first, Size count, Generator g, /*is_parallel=*/std::true_type, IsVector vec) { + return pattern_generate(first, first + count, g, std::true_type(), vec); +} + //------------------------------------------------------------------------ // remove //------------------------------------------------------------------------ @@ -1365,78 +1688,80 @@ RandomAccessIterator pattern_is_heap_until(RandomAccessIterator first, RandomAcc // min_element //------------------------------------------------------------------------ -template +template ForwardIterator brick_min_element(ForwardIterator first, ForwardIterator last, Compare comp, /* is_vector = */ std::false_type) noexcept { return std::min_element(first, last, comp); } -template +template ForwardIterator brick_min_element(ForwardIterator first, ForwardIterator last, Compare comp, /* is_vector = */ std::true_type) noexcept { __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); return std::min_element(first, last, comp); } -template +template ForwardIterator pattern_min_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::false_type) noexcept { return brick_min_element(first, last, comp, is_vector); } -template +template ForwardIterator pattern_min_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_min_element(first, last, comp, is_vector); -} - 
-//------------------------------------------------------------------------ -// max_element -//------------------------------------------------------------------------ - -template -ForwardIterator brick_max_element(ForwardIterator first, ForwardIterator last, Compare comp, /* is_vector = */ std::false_type) noexcept { - return std::max_element(first, last, comp); -} - -template -ForwardIterator brick_max_element(ForwardIterator first, ForwardIterator last, Compare comp, /* is_vector = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::max_element(first, last, comp); -} - -template -ForwardIterator pattern_max_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::false_type) noexcept { - return brick_max_element(first, last, comp, is_vector); -} + if(first == last) + return last; -template -ForwardIterator pattern_max_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_max_element(first, last, comp, is_vector); + return except_handler([=]() { + return par_backend::parallel_reduce( + first + 1, last, first, + [=](ForwardIterator begin, ForwardIterator end, ForwardIterator init) -> ForwardIterator { + const ForwardIterator subresult = brick_min_element(begin, end, comp, is_vector); + return cmp_iterators_by_values(init, subresult, comp); + }, + [=](ForwardIterator it1, ForwardIterator it2) -> ForwardIterator { + return cmp_iterators_by_values(it1, it2, comp); + } + ); + }); } //------------------------------------------------------------------------ // minmax_element //------------------------------------------------------------------------ -template +template std::pair brick_minmax_element(ForwardIterator first, ForwardIterator last, Compare comp, /* 
is_vector = */ std::false_type) noexcept { return std::minmax_element(first, last, comp); } -template +template std::pair brick_minmax_element(ForwardIterator first, ForwardIterator last, Compare comp, /* is_vector = */ std::true_type) noexcept { __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); return std::minmax_element(first, last, comp); } -template +template std::pair pattern_minmax_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::false_type) noexcept { return brick_minmax_element(first, last, comp, is_vector); } -template +template std::pair pattern_minmax_element(ForwardIterator first, ForwardIterator last, Compare comp, IsVector is_vector, /* is_parallel = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_minmax_element(first, last, comp, is_vector); + if(first == last) + return std::make_pair(first, first); + + return except_handler([=]() { + typedef std::pair result_t; + + return par_backend::parallel_reduce( + first + 1, last, std::make_pair(first, first), + [=](ForwardIterator begin, ForwardIterator end, result_t init) -> result_t { + const result_t subresult = brick_minmax_element(begin, end, comp, is_vector); + return std::make_pair(cmp_iterators_by_values(subresult.first, init.first, comp), cmp_iterators_by_values(init.second, subresult.second, not_pred(comp))); + }, + [=](result_t p1, result_t p2) -> result_t { + return std::make_pair(cmp_iterators_by_values(p1.first, p2.first, comp), cmp_iterators_by_values(p2.second, p1.second, not_pred(comp))); + } + ); + }); } //------------------------------------------------------------------------ @@ -1444,8 +1769,12 @@ std::pair pattern_minmax_element(ForwardIterat //------------------------------------------------------------------------ template std::pair mismatch_serial(InputIterator1 first1, InputIterator1 last1, InputIterator2 
first2, InputIterator2 last2, BinaryPredicate pred) { +#if __PSTL_CPP14_2RANGE_MISMATCH_EQUAL_PRESENT + return std::mismatch(first1, last1, first2, last2, pred); +#else for (; first1 != last1 && first2 != last2 && pred(*first1, *first2); ++first1,++first2){ } - return std::pair(first1, first2); + return std::make_pair(first1, first2); +#endif } template @@ -1455,8 +1784,8 @@ std::pair brick_mismatch(InputIterator1 first1, template std::pair brick_mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Predicate pred, /* is_vector = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return mismatch_serial(first1, last1, first2, last2, pred); + auto n = std::min(last1 - first1, last2 - first2); + return simd_first(first1, n, first2, not_pred(pred)); } template @@ -1466,8 +1795,13 @@ std::pair pattern_mismatch(InputIterator1 first1 template std::pair pattern_mismatch(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, InputIterator2 last2, Predicate pred, IsVector is_vector, /* is_parallel = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_mismatch(first1, last1, first2, last2, pred, is_vector); + return except_handler([=]() { + auto n = std::min(last1 - first1, last2 - first2); + auto result = par_backend::parallel_first(first1, first1+n, [first1, first2, pred, is_vector](InputIterator1 i, InputIterator1 j) { + return brick_mismatch(i, j, first2 + (i - first1), first2 + (j - first1), pred, is_vector).first; + }); + return std::make_pair(result, first2 + (result - first1)); + }); } //------------------------------------------------------------------------ @@ -1496,32 +1830,7 @@ bool pattern_lexicographical_compare(InputIterator1 first1, InputIterator1 last1 return brick_lexicographical_compare(first1, last1, first2, last2, comp, is_vector); } 
-//------------------------------------------------------------------------ -// move -//------------------------------------------------------------------------ - -template -OutputIterator brick_move(InputIterator first, InputIterator last, OutputIterator d_first, /* is_vector = */ std::false_type) noexcept { - return std::move(first, last, d_first); -} - -template -OutputIterator brick_move(InputIterator first, InputIterator last, OutputIterator d_first, /* is_vector = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Vectorized algorithm unimplemented, redirected to serial"); - return std::move(first, last, d_first); -} - -template -OutputIterator pattern_move(InputIterator first, InputIterator last, OutputIterator d_first, IsVector is_vector, /* is_parallel = */ std::false_type) noexcept { - return brick_move(first, last, d_first, is_vector); -} - -template -OutputIterator pattern_move(InputIterator first, InputIterator last, OutputIterator d_first, IsVector is_vector, /* is_parallel = */ std::true_type) noexcept { - __PSTL_PRAGMA_MESSAGE("Parallel algorithm unimplemented, redirected to serial"); - return brick_move(first, last, d_first, is_vector); -} - -} // namespace __icp_algorithm +} // namespace internal +} // namespace pstl #endif /* __PSTL_algorithm_impl_H */ diff --git a/include/pstl/_internal/common.h b/include/pstl/internal/common.h similarity index 57% rename from include/pstl/_internal/common.h rename to include/pstl/internal/common.h index 9c9bb7a9143..83f5d129088 100644 --- a/include/pstl/_internal/common.h +++ b/include/pstl/internal/common.h @@ -21,59 +21,11 @@ #ifndef __PSTL_common_H #define __PSTL_common_H -// Header contains implementation of common utilities. 
+#include +#include -#if __PSTL_USE_TBB -#include -#endif - -namespace __icp_algorithm { - -static int __PSTL_get_workers_num() { -#if __PSTL_USE_TBB - return tbb::tbb_thread::hardware_concurrency(); -#else - __PSTL_PRAGMA_MESSAGE("Backend was not specified"); - return 1; -#endif -} - -// FIXME - make grain_size use compiler information, or make parallel_for/parallel_transform_reduce use introspection for -// better estimate. - -//! Helper for parallel_for and parallel_reduce -template -DifferenceType __PSTL_grain_size( DifferenceType m ) { - const size_t oversub = 8; - int n = __PSTL_get_workers_num(); - m /= oversub*n; - const int min_grain = 1; - const int max_grain = 1<<16; - if( mmax_grain ) - m = max_grain; - return m; -} - -//! Raw memory buffer with automatic freeing and no exceptions. -/** Some of our algorithms need to start with raw memory buffer, -not an initialize array, because initialization/destruction -would make the span be at least O(N). */ -class raw_buffer { - void* ptr; - raw_buffer(const raw_buffer&) = delete; - void operator=(const raw_buffer&) = delete; -public: - //! Try to obtain buffer of given size. - raw_buffer(size_t bytes): ptr(operator new(bytes, std::nothrow)) {} - //! True if buffer was successfully obtained, zero otherwise. - operator bool() const { return ptr != NULL; } - //! Return pointer to buffer, or NULL if buffer could not be obtained. - void* get() const { return ptr; } - //! 
Destroy buffer - ~raw_buffer() { operator delete(ptr); } -}; +namespace pstl { +namespace internal { template typename std::result_of::type except_handler(F f) { @@ -88,6 +40,37 @@ typename std::result_of::type except_handler(F f) { } } +template +void invoke_if(std::true_type, F f) { + f(); +} + +template +void invoke_if(std::false_type, F f) {} + +template +void invoke_if_not(std::false_type, F f) { + f(); +} + +template +void invoke_if_not(std::true_type, F f) {} + +template +typename std::result_of::type invoke_if_else(std::true_type, F1 f1, F2 f2) { + return f1(); +} + +template +typename std::result_of::type invoke_if_else(std::false_type, F1 f1, F2 f2) { + return f2(); +} + +template +typename std::iterator_traits::pointer reduce_to_ptr(Iterator it) { + return std::addressof(*it); +} + //! Unary operator that returns reference to its argument. struct no_op { template @@ -102,7 +85,7 @@ class not_pred { explicit not_pred( Pred pred_ ) : pred(pred_) {} template - bool operator()( Args&& ... args ) const { return !pred(std::forward(args)...); } + bool operator()( Args&& ... args ) { return !pred(std::forward(args)...); } }; template @@ -112,7 +95,7 @@ class reorder_pred { explicit reorder_pred( Pred pred_ ) : pred(pred_) {} template - bool operator()(T&& a, T&& b) const { return pred(std::forward(b), std::forward(a)); } + bool operator()(T&& a, T&& b) { return pred(std::forward(b), std::forward(a)); } }; //! "==" comparison. @@ -148,6 +131,16 @@ class not_equal_value { bool operator()( Arg&& arg ) const { return !(std::forward(arg)==value); } }; -} /* namespace __icp_algorithm */ +template +ForwardIterator cmp_iterators_by_values(ForwardIterator a, ForwardIterator b, Compare comp) { + if(a < b) { // we should return closer iterator + return comp(*b, *a) ? b : a; + } else { + return comp(*a, *b) ? 
a : b; + } +} + +} // namespace internal +} // namespace pstl #endif /* __PSTL_common_H */ diff --git a/include/pstl/_internal/execution_policy_impl.h b/include/pstl/internal/execution_policy_impl.h similarity index 88% rename from include/pstl/_internal/execution_policy_impl.h rename to include/pstl/internal/execution_policy_impl.h index 11ce2399f71..e6493e09ee8 100644 --- a/include/pstl/_internal/execution_policy_impl.h +++ b/include/pstl/internal/execution_policy_impl.h @@ -26,7 +26,8 @@ #include "../execution" -namespace __icp_algorithm { +namespace pstl { +namespace internal { using namespace pstl::execution; /* predicate */ @@ -37,6 +38,12 @@ template template inline T lazy_and( T a, std::true_type ) { return a; } +template + std::true_type lazy_or( T, std::true_type ) { return std::true_type{}; }; + +template + inline T lazy_or( T a, std::false_type ) { return a; } + /* iterator */ template struct is_random_access_iterator { @@ -96,13 +103,13 @@ template using collector_t = typename policy_traits::type>::collector_type; template using allow_vector = - typename __icp_algorithm::policy_traits::type>::allow_vector; + typename internal::policy_traits::type>::allow_vector; template using allow_unsequenced = - typename __icp_algorithm::policy_traits::type>::allow_unsequenced; + typename internal::policy_traits::type>::allow_unsequenced; template using allow_parallel = - typename __icp_algorithm::policy_traits::type>::allow_parallel; + typename internal::policy_traits::type>::allow_parallel; template @@ -132,7 +139,7 @@ struct prefer_parallel_tag { allow_parallel::value && is_random_access_iterator::value; typedef std::integral_constant type; }; - -} // namespace __icp_algorithm +} // namespace internal +} // namespace pstl #endif /* __PSTL_execution_policy_impl_H */ diff --git a/include/pstl/_internal/numeric_impl.h b/include/pstl/internal/numeric_impl.h similarity index 86% rename from include/pstl/_internal/numeric_impl.h rename to 
include/pstl/internal/numeric_impl.h index 210b5f9cb22..8e0b2edf304 100644 --- a/include/pstl/_internal/numeric_impl.h +++ b/include/pstl/internal/numeric_impl.h @@ -28,15 +28,14 @@ #include "execution_policy_impl.h" -namespace __icp_algorithm { -//------------------------------------------------------------------------ -// forward -//------------------------------------------------------------------------ -template -T parallel_transform_reduce(Index first, Index last, U u, T init, C combine, R reduce); -template -T parallel_transform_scan(Index n, U u, T init, C combine, R reduce, S scan); - +#if __PSTL_USE_TBB + #include "parallel_impl_tbb.h" +#else + __PSTL_PRAGMA_MESSAGE("Backend was not specified"); +#endif + +namespace pstl { +namespace internal { //------------------------------------------------------------------------ // transform_reduce (version with two binary functions, according to draft N4659) //------------------------------------------------------------------------ @@ -91,13 +90,15 @@ T pattern_transform_reduce(InputIterator1 first1, InputIterator1 last1, InputIte template T pattern_transform_reduce(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, T init, BinaryOperation1 binary_op1, BinaryOperation2 binary_op2, IsVector is_vector, /*is_parallel=*/std::true_type) noexcept { - return parallel_transform_reduce(first1, last1, - [first1, first2, binary_op2](InputIterator1 i) { return binary_op2(*i, *(first2 + (i - first1))); }, - init, - binary_op1, // Combine - [first1, first2, binary_op1, binary_op2, is_vector](InputIterator1 i, InputIterator1 j, T init) -> T { - return brick_transform_reduce(i, j, first2 + (i - first1), - init, binary_op1, binary_op2, is_vector); + return except_handler([&]() { + return par_backend::parallel_transform_reduce(first1, last1, + [first1, first2, binary_op2](InputIterator1 i) mutable { return binary_op2(*i, *(first2 + (i - first1))); }, + init, + binary_op1, // Combine + [first1, first2, binary_op1, 
binary_op2, is_vector](InputIterator1 i, InputIterator1 j, T init) -> T { + return brick_transform_reduce(i, j, first2 + (i - first1), + init, binary_op1, binary_op2, is_vector); + }); }); } @@ -123,12 +124,14 @@ T pattern_transform_reduce(InputIterator first, InputIterator last, T init, Bina template T pattern_transform_reduce(InputIterator first, InputIterator last, T init, BinaryOperation binary_op, UnaryOperation unary_op, IsVector is_vector, /*is_parallel=*/std::true_type) { - return parallel_transform_reduce(first, last, - [unary_op](InputIterator i) {return unary_op(*i); }, - init, - binary_op, - [unary_op, binary_op, is_vector](InputIterator i, InputIterator j, T init) { - return brick_transform_reduce(i, j, init, binary_op, unary_op, is_vector); + return except_handler([&]() { + return par_backend::parallel_transform_reduce(first, last, + [unary_op](InputIterator i) mutable {return unary_op(*i); }, + init, + binary_op, + [unary_op, binary_op, is_vector](InputIterator i, InputIterator j, T init) { + return brick_transform_reduce(i, j, init, binary_op, unary_op, is_vector); + }); }); } @@ -167,18 +170,21 @@ OutputIterator pattern_transform_scan(InputIterator first, InputIterator last, O template OutputIterator pattern_transform_scan(InputIterator first, InputIterator last, OutputIterator result, UnaryOperation unary_op, T init, BinaryOperation binary_op, Inclusive, IsVector is_vector, /*is_parallel=*/std::true_type ) { typedef typename std::iterator_traits::difference_type difference_type; - parallel_transform_scan( - last-first, - [first, unary_op](size_t i) {return unary_op(first[i]); }, - init, - binary_op, - [first, unary_op, binary_op, is_vector](difference_type i, difference_type j, T init) { - return brick_transform_reduce(first+i, first+j, init, binary_op, unary_op, is_vector); - }, + + return except_handler([=]() { + par_backend::parallel_transform_scan( + last-first, + [first, unary_op](size_t i) mutable {return unary_op(first[i]); }, + init, + 
binary_op, + [first, unary_op, binary_op, is_vector](difference_type i, difference_type j, T init) { + return brick_transform_reduce(first+i, first+j, init, binary_op, unary_op, is_vector); + }, [first, unary_op, binary_op, result](difference_type i, difference_type j, T init) { return brick_transform_scan(first+i, first+j, result+i, unary_op, init, binary_op, Inclusive()).second; + }); + return result+(last-first); }); - return result+(last-first); } @@ -208,6 +214,7 @@ OutputIterator pattern_adjacent_difference(InputIterator first, InputIterator la return brick_adjacent_difference(first, last, d_first, op, is_vector); } -} // namespace __icp_algorithm +} // namespace internal +} // namespace pstl #endif /* __PSTL_numeric_impl_H */ diff --git a/include/pstl/_internal/parallel_impl_tbb.h b/include/pstl/internal/parallel_impl_tbb.h similarity index 60% rename from include/pstl/_internal/parallel_impl_tbb.h rename to include/pstl/internal/parallel_impl_tbb.h index 90636e5ed79..115f41d9c93 100644 --- a/include/pstl/_internal/parallel_impl_tbb.h +++ b/include/pstl/internal/parallel_impl_tbb.h @@ -21,15 +21,10 @@ #ifndef __PSTL_parallel_impl_tbb_H #define __PSTL_parallel_impl_tbb_H +#include // This header defines the minimum set of parallel routines required to support Parallel STL, // implemented on top of Intel(R) Threading Building Blocks (Intel(R) TBB) library -#include - -#include "common.h" -#include "algorithm_impl.h" -#include "numeric_impl.h" /* count and count_if use pattern_transform_reduce */ - // Bring in minimal required subset of Intel TBB #include #include @@ -42,7 +37,32 @@ #error Intel(R) Threading Building Blocks 2018 is required; older versions are not supported. #endif -namespace __icp_algorithm { +namespace pstl { +namespace par_backend { + +//! Raw memory buffer with automatic freeing and no exceptions. 
+/** Some of our algorithms need to start with raw memory buffer, +not an initialize array, because initialization/destruction +would make the span be at least O(N). */ +class raw_buffer { + void* ptr; + raw_buffer(const raw_buffer&) = delete; + void operator=(const raw_buffer&) = delete; +public: + //! Try to obtain buffer of given size. + raw_buffer(size_t bytes): ptr(operator new(bytes, std::nothrow)) {} + //! True if buffer was successfully obtained, zero otherwise. + operator bool() const { return ptr != NULL; } + //! Return pointer to buffer, or NULL if buffer could not be obtained. + void* get() const { return ptr; } + //! Destroy buffer + ~raw_buffer() { operator delete(ptr); } +}; + +// Wrapper for tbb::task +inline void cancel_execution() { + tbb::task::self().group()->cancel_group_execution(); +} //------------------------------------------------------------------------ // parallel_for @@ -60,48 +80,28 @@ class parallel_for_body { RealBody my_body; }; -//! Evaluate brick f[i,j) to each subrange [i,j) of [first,last) -// wrapper over tbb::parallel_for with exceptions handler +//! Evaluation of brick f[i,j) for each subrange [i,j) of [first,last) +// wrapper over tbb::parallel_for template void parallel_for(Index first, Index last, F f) { - except_handler([=]() { - tbb::this_task_arena::isolate([=]() { - tbb::parallel_for(tbb::blocked_range(first, last), parallel_for_body(f)); - }); + tbb::this_task_arena::isolate([=]() { + tbb::parallel_for(tbb::blocked_range(first, last), parallel_for_body(f)); }); } -// wrapper over tbb::parallel_reduce with exceptions handler -template -Value parallel_reduce(tbb::blocked_range range, Body &body) { - return except_handler([range, &body]()->Value { - tbb::this_task_arena::isolate([range, &body]() { - tbb::parallel_reduce(range, body); - }); - return body.sum(); - }); -} - -// wrapper over tbb::parallel_reduce with exceptions handler +//! 
Evaluation of brick f[i,j) for each subrange [i,j) of [first,last) +// wrapper over tbb::parallel_reduce template -Value parallel_reduce(tbb::blocked_range range, const Value& identity, const RealBody& real_body, const Reduction& reduction) { - return except_handler([range, &identity, &real_body, &reduction]()->Value { - return tbb::this_task_arena::isolate([range, &identity, &real_body, &reduction]()->Value { - return tbb::parallel_reduce(range, identity, real_body, reduction); - }); +Value parallel_reduce(Index first, Index last, const Value& identity, const RealBody& real_body, const Reduction& reduction, std::size_t grainsize = 1) { + return tbb::this_task_arena::isolate([first, last, grainsize, &identity, &real_body, &reduction]()->Value { + return tbb::parallel_reduce(tbb::blocked_range(first, last, grainsize), identity, + [real_body](const tbb::blocked_range& r, const Value& value)-> Value { + return real_body(r.begin(), r.end(), value); + }, + reduction); }); } -// wrapper over tbb::parallel_scan with exceptions handler -template -Value parallel_scan(tbb::blocked_range range, Body &body) { - return except_handler([range, &body]()->Value { - tbb::this_task_arena::isolate([range, &body]() { - tbb::parallel_scan(range, body); - }); - return body.sum(); - }); -} //------------------------------------------------------------------------ // parallel_transform_reduce // @@ -164,7 +164,10 @@ template T parallel_transform_reduce( Index first, Index last, U u, T init, C combine, R brick_reduce) { par_trans_red_body body(u, init, combine, brick_reduce); // The grain size of 3 is used in order to provide mininum 2 elements for each body - return __icp_algorithm::parallel_reduce(tbb::blocked_range(first, last, 3), body); + tbb::this_task_arena::isolate([first, last, &body]() { + tbb::parallel_reduce(tbb::blocked_range(first, last, 3), body); + }); + return body.sum(); } //------------------------------------------------------------------------ @@ -246,7 +249,11 @@ 
template T parallel_transform_scan(Index n, U u, T init, C combine, R brick_reduce, S scan) { if(n) { trans_scan_body body(u, init, combine, brick_reduce, scan); - return __icp_algorithm::parallel_scan(tbb::blocked_range(0, n), body); + auto range = tbb::blocked_range(0, n); + tbb::this_task_arena::isolate([range, &body]() { + tbb::parallel_scan(range, body); + }); + return body.sum(); } else return init; @@ -309,38 +316,36 @@ void downsweep(Index i, Index m, Index tilesize, T* r, Index lastsize, T initial // T must have a trivial constructor and destructor. template void parallel_strict_scan( Index n, T initial, R reduce, C combine, S scan, A apex ) { - except_handler([=]() { - tbb::this_task_arena::isolate([=](){ - if( n>1 ) { - Index p = __PSTL_get_workers_num(); - const Index slack = 4; - Index tilesize = (n-1)/(slack*p) + 1; - Index m = (n-1)/tilesize; - raw_buffer buf((m+1)*sizeof(T)); - if( buf ) { - T* r = static_cast(buf.get()); - upsweep(Index(0), Index(m+1), tilesize, r, n-m*tilesize, reduce, combine); - // When apex is a no-op and combine has no side effects, a good optimizer - // should be able to eliminate all code between here and apex. - // Alternatively, provide a default value for apex that can be - // recognized by metaprogramming that conditionlly executes the following. 
- size_t k = m+1; - T t = r[k-1]; - while( (k&=k-1) ) - t = combine(r[k-1],t); - apex(combine(initial,t)); - downsweep(Index(0), Index(m+1), tilesize, r, n-m*tilesize, initial, combine, scan); - return; - } + tbb::this_task_arena::isolate([=](){ + if( n>1 ) { + Index p = tbb::this_task_arena::max_concurrency(); + const Index slack = 4; + Index tilesize = (n-1)/(slack*p) + 1; + Index m = (n-1)/tilesize; + raw_buffer buf((m+1)*sizeof(T)); + if( buf ) { + T* r = static_cast(buf.get()); + upsweep(Index(0), Index(m+1), tilesize, r, n-m*tilesize, reduce, combine); + // When apex is a no-op and combine has no side effects, a good optimizer + // should be able to eliminate all code between here and apex. + // Alternatively, provide a default value for apex that can be + // recognized by metaprogramming that conditionlly executes the following. + size_t k = m+1; + T t = r[k-1]; + while( (k&=k-1) ) + t = combine(r[k-1],t); + apex(combine(initial,t)); + downsweep(Index(0), Index(m+1), tilesize, r, n-m*tilesize, initial, combine, scan); + return; } - // Fewer than 2 elements in sequence, or out of memory. Handle has single block. - T sum = initial; - if(n) - sum = combine(sum, reduce(Index(0), n)); - apex(sum); - if(n) - scan(Index(0), n, initial); - }); + } + // Fewer than 2 elements in sequence, or out of memory. Handle has single block. + T sum = initial; + if(n) + sum = combine(sum, reduce(Index(0), n)); + apex(sum); + if(n) + scan(Index(0), n, initial); }); } @@ -351,15 +356,14 @@ void parallel_strict_scan( Index n, T initial, R reduce, C combine, S scan, A ap //! 
Return true if brick f[i,j) returns true for some subrange [i,j) of [first,last) template bool parallel_or( Index first, Index last, Brick f ) { - return except_handler([=]() -> bool { std::atomic found(false); - __icp_algorithm::parallel_for( first, last, [f,&found]( Index i, Index j ) { - if( f(i, j) ) { - found = true; - tbb::task::self().group()->cancel_group_execution(); - }}); + parallel_for(first, last, [f, &found](Index i, Index j) { + if (!found.load(std::memory_order_relaxed) && f(i, j)) { + found.store(true, std::memory_order_relaxed); + tbb::task::self().group()->cancel_group_execution(); + }} + ); return found; - }); } //------------------------------------------------------------------------ @@ -370,18 +374,17 @@ bool parallel_or( Index first, Index last, Brick f ) { Each f[i,j) must return a value in [i,j). */ template Index parallel_first( Index first, Index last, Brick f ) { - return except_handler([=]() -> Index { typedef typename std::iterator_traits::difference_type difference_type; - difference_type n = last-first; + const difference_type n = last-first; std::atomic minimum( last-first ); - __icp_algorithm::parallel_for(first, last, [f, first, &minimum](Index i, Index j) { + parallel_for(first, last, [f, first, &minimum](Index i, Index j) { // See "Reducing Contention Through Priority Updates", PPoPP '13, for discussion of // why using a shared variable scales fairly well in this situation. 
if (i - first < minimum) { Index res = f(i, j); // If not 'last' returned then we found what we want so put this to minimum if (res != j) { - difference_type k = res - first; + const difference_type k = res - first; for (difference_type old = minimum; k < old; old = minimum) { minimum.compare_exchange_weak(old, k); } @@ -389,7 +392,6 @@ Index parallel_first( Index first, Index last, Brick f ) { } }); return first + minimum; - }); } //------------------------------------------------------------------------ @@ -403,14 +405,16 @@ Index parallel_first( Index first, Index last, Brick f ) { //------------------------------------------------------------------------ //! Destroy sequence [xs,xe) -template -void serial_destroy(RandomAccessIterator zs, RandomAccessIterator ze) { - typedef typename std::iterator_traits::value_type T; - while(zs!=ze) { - --ze; - (*ze).~T(); +struct serial_destroy { + template + void operator()(RandomAccessIterator zs, RandomAccessIterator ze) { + typedef typename std::iterator_traits::value_type T; + while(zs!=ze) { + --ze; + (*ze).~T(); + } } -} +}; //! 
Merge sequences [xs,xe) and [ys,ye) to output sequence [zs,(xe-xs)+(ye-ys)), using std::move template @@ -438,7 +442,7 @@ void serial_move_merge(RandomAccessIterator1 xs, RandomAccessIterator1 xe, Rando template void merge_sort_init_temp_buf(RandomAccessIterator1 xs, RandomAccessIterator1 xe, RandomAccessIterator2 zs, bool inplace) { - RandomAccessIterator2 ze = zs + (xe-xs); + const RandomAccessIterator2 ze = zs + (xe-xs); typedef typename std::iterator_traits::value_type T; if(inplace) // Initialize the temporary buffer @@ -450,33 +454,33 @@ void merge_sort_init_temp_buf(RandomAccessIterator1 xs, RandomAccessIterator1 xe new(&*zs) T(std::move(*xs)); } -template +template class merge_task: public tbb::task { /*override*/tbb::task* execute(); RandomAccessIterator1 xs, xe; RandomAccessIterator2 ys, ye; RandomAccessIterator3 zs; Compare comp; - bool destroy; + Cleanup cleanup; public: merge_task( RandomAccessIterator1 xs_, RandomAccessIterator1 xe_, RandomAccessIterator2 ys_, RandomAccessIterator2 ye_, RandomAccessIterator3 zs_, - bool destroy_, Compare comp_) : - xs(xs_), xe(xe_), ys(ys_), ye(ye_), zs(zs_), destroy(destroy_), comp(comp_) + Compare comp_, Cleanup cleanup_) : + xs(xs_), xe(xe_), ys(ys_), ye(ye_), zs(zs_), comp(comp_), cleanup(cleanup_) {} }; -template -tbb::task* merge_task::execute() { +template +tbb::task* merge_task::execute() { const size_t MERGE_CUT_OFF = 2000; - auto n = (xe-xs) + (ye-ys); + const auto n = (xe-xs) + (ye-ys); if(n <= MERGE_CUT_OFF) { serial_move_merge(xs, xe, ys, ye, zs, comp); - if(destroy) { - serial_destroy(xs, xe); - serial_destroy(ys, ye); - } + + //we clean the buffer one time on last step of the sort + cleanup(xs, xe); + cleanup(ys, ye); return NULL; } else { @@ -490,9 +494,9 @@ tbb::task* merge_task + void operator()(T, T) {} +}; + const size_t STABLE_SORT_CUT_OFF = 500; template @@ -525,14 +535,16 @@ tbb::task* stable_sort_task(zs, zm, zm, ze, xs, inplace==2, comp); + if (inplace == 2) + m = new (allocate_continuation()) 
merge_task(zs, zm, zm, ze, xs, comp, serial_destroy()); + else if (inplace) + m = new (allocate_continuation()) merge_task(zs, zm, zm, ze, xs, comp, binary_no_op()); else - m = new (allocate_continuation()) merge_task(xs, xm, xm, xe, zs, false, comp); + m = new (allocate_continuation()) merge_task(xs, xm, xm, xe, zs, comp, binary_no_op()); m->set_ref_count(2); task* right = new(m->allocate_child()) stable_sort_task(xm,xe,zm,!inplace, comp, leaf_sort); spawn(*right); @@ -545,123 +557,23 @@ tbb::task* stable_sort_task void parallel_stable_sort( RandomAccessIterator xs, RandomAccessIterator xe, Compare comp, LeafSort leaf_sort ) { - except_handler([=]() { - tbb::this_task_arena::isolate([=](){ - typedef typename std::iterator_traits::value_type T; - if( xe-xs > STABLE_SORT_CUT_OFF ) { - __icp_algorithm::raw_buffer buf( sizeof(T)*(xe-xs) ); - if( buf ) { - using tbb::task; - typedef typename std::iterator_traits::value_type T; - task::spawn_root_and_wait(*new( task::allocate_root() ) __icp_algorithm::stable_sort_task( xs, xe, (T*)buf.get(), 2, comp, leaf_sort )); - return; - } + tbb::this_task_arena::isolate([=](){ + typedef typename std::iterator_traits::value_type T; + if( xe-xs > STABLE_SORT_CUT_OFF ) { + raw_buffer buf( sizeof(T)*(xe-xs) ); + if( buf ) { + using tbb::task; + typedef typename std::iterator_traits::value_type T; + task::spawn_root_and_wait(*new( task::allocate_root() ) stable_sort_task( xs, xe, (T*)buf.get(), 2, comp, leaf_sort )); + return; } - // Not enough memory available or sort too small - fall back on serial sort - leaf_sort( xs, xe, comp ); - }); - }); -} - -//------------------------------------------------------------------------ -// parallel_equal -//------------------------------------------------------------------------ -template -bool parallel_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, Pred p, IsVector vec) { - return __icp_algorithm::parallel_reduce( - tbb::blocked_range(first1, last1, 500), true, - 
[=](const tbb::blocked_range& r, bool is_equal_local) -> bool { - return is_equal_local && brick_equal(r.begin(), r.end(), first2+(r.begin()-first1), p, vec); - }, - [=](const bool is_equal_local1, const bool is_equal_local2) -> bool { - return is_equal_local1 && is_equal_local2; - } - ); -} - -template -bool pattern_equal(InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, BinaryPredicate p, IsVector vec, /*is_parallel=*/std::true_type) { - return parallel_equal(first1, last1, first2, p, vec); -} - -//------------------------------------------------------------------------ -// count -//------------------------------------------------------------------------ -template -typename std::iterator_traits::difference_type -pattern_count(InputIterator first, InputIterator last, Predicate pred, /* is_parallel */ std::true_type, IsVector vec) { - typedef typename std::iterator_traits::difference_type size_type; - return __icp_algorithm::parallel_reduce(tbb::blocked_range(first, last), size_type(0), - [pred, vec](const tbb::blocked_range& r, size_type value)->size_type { - return value+brick_count(r.begin(), r.end(), pred, vec); - }, - std::plus() - ); -} - -//------------------------------------------------------------------------ -// adjacent_find -//------------------------------------------------------------------------ -template -Index pattern_adjacent_find(Index first, Index last, BinaryPredicate pred, /* is_parallel */ std::true_type, IsVector vec, bool or_semantic) { - if(last-first<2) - return last; - - return __icp_algorithm::parallel_reduce(tbb::blocked_range(first, last), last, - [last, pred, vec, or_semantic](const tbb::blocked_range& r, Index value)->Index { - - if(or_semantic && valuecancel_group_execution(); - return value; - } - - if(value>r.begin()) { - //modify local_last to check the predicate on the boundary values; //TODO: to use a custom tbb::blocked_range with boundaries overlapping - Index local_last = r.end(); - if(local_last!=last) - 
++local_last; - - //correct the global result iterator if the "brick" returns a local "last" - const Index res = brick_adjacent_find(r.begin(), local_last, pred, vec, or_semantic); - if(resIndex { return x -Index pattern_fill(Index first, Index last, const T& value, /*is_parallel=*/std::true_type, IsVector vec) { - __icp_algorithm::parallel_for(first, last, [&value, vec](Index begin, Index end) { - brick_fill(begin, end, value, vec); }); - return last; -} - -template -Index pattern_fill_n(Index first, Size count, const T& value, /*is_parallel=*/std::true_type, IsVector vec) { - return pattern_fill(first, first + count, value, std::true_type(), vec); -} - -//------------------------------------------------------------------------ -// generate, generate_n -//------------------------------------------------------------------------ -template -Index pattern_generate(Index first, Index last, Generator g, /*is_parallel=*/std::true_type, IsVector vec) { - __icp_algorithm::parallel_for(first, last, [g, vec](Index begin, Index end) { - brick_generate(begin, end, g, vec); }); - return last; -} - -template -Index pattern_generate_n(Index first, Size count, Generator g, /*is_parallel=*/std::true_type, IsVector vec) { - return pattern_generate(first, first + count, g, std::true_type(), vec); + // Not enough memory available or sort too small - fall back on serial sort + leaf_sort( xs, xe, comp ); + }); } -} // namespace __icp_algorithm +} // namespace par_backend +} // namespace pstl #endif /* __PSTL_parallel_impl_tbb_H */ diff --git a/include/pstl/_internal/pstl_config.h b/include/pstl/internal/pstl_config.h similarity index 81% rename from include/pstl/_internal/pstl_config.h rename to include/pstl/internal/pstl_config.h index c7d2daf011b..2e9af9c1b81 100644 --- a/include/pstl/_internal/pstl_config.h +++ b/include/pstl/internal/pstl_config.h @@ -21,6 +21,10 @@ #ifndef __PSTL_config_H #define __PSTL_config_H +#define PSTL_VERSION 102 +#define PSTL_VERSION_MAJOR 
(PSTL_VERSION/100) +#define PSTL_VERSION_MINOR (PSTL_VERSION - PSTL_VERSION_MAJOR * 100) + #if _WIN32 && __PSTL_SHARED_LINKAGE #if __PSTL_EXPORTS #define __PSTL_API __declspec(dllexport) @@ -63,6 +67,10 @@ #define __PSTL_STRING(x) __PSTL_STRING_AUX(x) #define __PSTL_STRING_CONCAT(x, y) x#y +// note that when ICC or Clang is in use, __PSTL_GCC_VERSION might not fully match +// the actual GCC version on the system. +#define __PSTL_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) + // Enable SIMD for compilers that support OpenMP 4.0 #if (_OPENMP >= 201307) || (__INTEL_COMPILER >= 1600) || (__PSTL_GCC_VERSION >= 40900) #define __PSTL_PRAGMA_SIMD __PSTL_PRAGMA(omp simd) @@ -75,13 +83,11 @@ #define __PSTL_PRAGMA_SIMD_REDUCTION(PRM) #endif //Enable SIMD -// note that when ICC or Clang is in use, __PSTL_GCC_VERSION might not fully match -// the actual GCC version on the system. -#define __PSTL_GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) - // Should be defined to 1 for environments with a vendor implementation of C++17 execution policies #define __PSTL_CPP17_EXECUTION_POLICIES_PRESENT 0 +#define __PSTL_CPP14_2RANGE_MISMATCH_EQUAL_PRESENT (_MSC_VER >= 1900 || __cplusplus >= 201300L || __cpp_lib_robust_nonmodifying_seq_ops == 201304) +#define __PSTL_CPP14_MAKE_REVERSE_ITERATOR_PRESENT (_MSC_VER >= 1900 || __cplusplus >= 201402L || __cpp_lib_make_reverse_iterator == 201402) #define __PSTL_CPP14_INTEGER_SEQUENCE_PRESENT (_MSC_VER >= 1900 || __cplusplus >= 201402L) #define __PSTL_CPP14_VARIABLE_TEMPLATES_PRESENT \ (!__INTEL_COMPILER || __INTEL_COMPILER >= 1700) && (_MSC_FULL_VER >= 190023918 || __cplusplus >= 201402L) @@ -97,8 +103,10 @@ #if __PSTL_MONOTONIC_PRESENT #define __PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(PRM) __PSTL_PRAGMA(omp ordered simd monotonic(PRM)) +#define __PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(PRM1, PRM2) __PSTL_PRAGMA(omp ordered simd monotonic(PRM1, PRM2)) #else #define 
__PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(PRM) +#define __PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(PRM1, PRM2) #endif #if (__INTEL_COMPILER >= 1600) @@ -107,6 +115,13 @@ #define __PSTL_PRAGMA_VECTOR_UNALIGNED #endif +// Check the user-defined macro to use non-temporal stores +#if defined(PSTL_USE_NONTEMPORAL_STORES) && (__INTEL_COMPILER >= 1600) +#define __PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED __PSTL_PRAGMA(vector nontemporal) +#else +#define __PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED +#endif + #if _MSC_VER || __INTEL_COMPILER //the preprocessors don't type a message location #define __PSTL_PRAGMA_LOCATION __FILE__ ":" __PSTL_STRING(__LINE__) ": warning: " #else diff --git a/include/pstl/_internal/simd_impl.h b/include/pstl/internal/simd_impl.h similarity index 70% rename from include/pstl/_internal/simd_impl.h rename to include/pstl/internal/simd_impl.h index d5a0965fc32..5ab5820cc68 100644 --- a/include/pstl/_internal/simd_impl.h +++ b/include/pstl/internal/simd_impl.h @@ -29,14 +29,16 @@ // This header defines the minimum set of vector routines required // to support parallel STL. 
- -namespace __icp_algorithm { +namespace pstl { +namespace internal { template -void simd_walk_1(Iterator first, DifferenceType n, Function f) noexcept { +Iterator simd_walk_1(Iterator first, DifferenceType n, Function f) noexcept { __PSTL_PRAGMA_SIMD for(DifferenceType i = 0; i < n; ++i) f(first[i]); + + return first + n; } template @@ -59,18 +61,19 @@ __PSTL_PRAGMA_SIMD template bool simd_or(Index first, DifferenceType n, Pred pred) noexcept { #if __PSTL_EARLYEXIT_PRESENT + DifferenceType i; __PSTL_PRAGMA_SIMD_EARLYEXIT - for(DifferenceType i = 0; i < n; ++i) + for(i = 0; i < n; ++i) if(pred(first[i])) - return true; - return false; + break; + return i < n; #else DifferenceType block_size = std::min(4, n); const Index last = first + n; while ( last != first ) { - int flag = 1; + int32_t flag = 1; __PSTL_PRAGMA_SIMD_REDUCTION(&:flag) - for ( int i = 0; i < block_size; ++i ) + for ( DifferenceType i = 0; i < block_size; ++i ) if ( pred(*(first + i)) ) flag = 0; if ( !flag ) @@ -102,20 +105,20 @@ __PSTL_PRAGMA_SIMD_EARLYEXIT #else const Index last = first + n; // Experiments show good block sizes like this - const int block_size = 8; - alignas(64) int lane[block_size] = {0}; + const DifferenceType block_size = 8; + alignas(64) DifferenceType lane[block_size] = {0}; while ( last - first >= block_size ) { - int found = 0; + DifferenceType found = 0; __PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part __PSTL_PRAGMA_SIMD_REDUCTION(|:found) - for ( int i = 0; i < block_size; ++i ) { + for ( DifferenceType i = 0; i < block_size; ++i ) { // To improve SIMD vectorization - int t = (pred(*(first + i))); + const DifferenceType t = (pred(*(first + i))); lane[i] = t; found |= t; } if ( found ) { - int i; + DifferenceType i; // This will vectorize for ( i = 0; i < block_size; ++i ) { if ( lane[i] ) break; @@ -136,36 +139,37 @@ __PSTL_PRAGMA_SIMD_REDUCTION(|:found) } template -Index1 simd_first(Index1 first1, DifferenceType n, Index2 first2, Pred pred) noexcept { 
+std::pair simd_first(Index1 first1, DifferenceType n, Index2 first2, Pred pred) noexcept { #if __PSTL_EARLYEXIT_PRESENT DifferenceType i = 0; __PSTL_PRAGMA_SIMD_EARLYEXIT for(;i < n; ++i) if(pred(first1[i], first2[i])) break; - return first1+i; + return std::make_pair(first1 + i, first2 + i); #else - Index1 last1 = first1 + n; + const Index1 last1 = first1 + n; + const Index2 last2 = first2 + n; // Experiments show good block sizes like this - const int block_size = 8; - alignas(64) int lane[block_size] = {0}; + const DifferenceType block_size = 8; + alignas(64) DifferenceType lane[block_size] = {0}; while ( last1 - first1 >= block_size ) { - int found = 0; - int i; + DifferenceType found = 0; + DifferenceType i; __PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part __PSTL_PRAGMA_SIMD_REDUCTION(|:found) for ( i = 0; i < block_size; ++i ) { - int t = pred(first1[i], first2[i]); + const DifferenceType t = pred(first1[i], first2[i]); lane[i] = t; found |= t; } if ( found ) { - int i; + DifferenceType i; // This will vectorize for ( i = 0; i < block_size; ++i ) { if ( lane[i] ) break; } - return first1 + i; + return std::make_pair(first1 + i, first2 + i); } first1 += block_size; first2 += block_size; @@ -174,9 +178,9 @@ __PSTL_PRAGMA_SIMD_REDUCTION(|:found) //Keep remainder scalar for(; last1 != first1; ++first1, ++first2) if ( pred(*(first1), *(first2)) ) - return first1; + return std::make_pair(first1, first2); - return last1; + return std::make_pair(last1, last2); #endif //__PSTL_EARLYEXIT_PRESENT } @@ -210,12 +214,13 @@ __PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(cnt:1) return result + cnt; } -template -OutputIterator simd_copy_n(InputIterator first, DifferenceType n, OutputIterator result) noexcept { +template +OutputIterator simd_copy_move(InputIterator first, DifferenceType n, OutputIterator result, Assigner assigner) noexcept { +__PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED __PSTL_PRAGMA_SIMD - for(DifferenceType i = 0; i < n; ++i) - result[i] = first[i]; - 
return result+n; + for (DifferenceType i = 0; i < n; ++i) + assigner(first + i, result + i); + return result + n; } template @@ -270,40 +275,52 @@ __PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC(cnt:1) } } +template +void simd_partition_by_mask(InputIterator first, DifferenceType n, OutputIterator1 out_true, OutputIterator2 out_false, bool* mask) noexcept { + DifferenceType cnt_true = 0, cnt_false = 0; +__PSTL_PRAGMA_SIMD + for (DifferenceType i = 0; i < n; ++i) { +__PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(cnt_true:1, cnt_false:1) + if (mask[i]) { + out_true[cnt_true] = first[i]; + ++cnt_true; + } + else { + out_false[cnt_false] = first[i]; + ++cnt_false; + } + } +} + template Index simd_fill_n(Index first, DifferenceType n, const T& value) noexcept { +__PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED __PSTL_PRAGMA_SIMD for (DifferenceType i = 0; i < n; ++i) first[i] = value; return first + n; } -template -void simd_fill(Index first, Index last, const T& value) noexcept { - simd_fill_n(first, last - first, value); -} - template Index simd_generate_n(Index first, DifferenceType size, Generator g) noexcept { +__PSTL_USE_NONTEMPORAL_STORES_IF_ALLOWED __PSTL_PRAGMA_SIMD for (DifferenceType i = 0; i < size; ++i) first[i] = g(); return first + size; } -template -void simd_generate(Index first, Index last, Generator g) noexcept { - simd_generate_n(first, last - first, g); -} - template Index simd_adjacent_find(Index first, Index last, BinaryPredicate pred, bool or_semantic) noexcept { if(last - first < 2) return last; + typedef typename std::iterator_traits::difference_type difference_type; + difference_type i = 0; + #if __PSTL_EARLYEXIT_PRESENT //Some compiler versions fail to compile the following loop when iterators are used. 
Indices are used instead - size_t i = 0, n = last-first-1; + const difference_type n = last-first-1; __PSTL_PRAGMA_SIMD_EARLYEXIT for(; i < n; ++i) if(pred(first[i], first[i+1])) @@ -313,15 +330,15 @@ __PSTL_PRAGMA_SIMD_EARLYEXIT #else // Experiments show good block sizes like this //TODO: to consider tuning block_size for various data types - const int block_size = 8; - alignas(64) int lane[block_size] = {0}; + const difference_type block_size = 8; + alignas(64) difference_type lane[block_size] = {0}; while ( last - first >= block_size ) { - int found = 0, i; + difference_type found = 0; __PSTL_PRAGMA_VECTOR_UNALIGNED // Do not generate peel loop part __PSTL_PRAGMA_SIMD_REDUCTION(|:found) for ( i = 0; i < block_size-1; ++i ) { //TODO: to improve SIMD vectorization - const int t = pred(*(first + i), *(first + i + 1)); + const difference_type t = pred(*(first + i), *(first + i + 1)); lane[i] = t; found |= t; } @@ -333,7 +350,7 @@ __PSTL_PRAGMA_SIMD_REDUCTION(|:found) if ( found ) { if(or_semantic) return first; - int i; + // This will vectorize for ( i = 0; i < block_size; ++i ) if ( lane[i] ) break; @@ -352,23 +369,30 @@ __PSTL_PRAGMA_SIMD_REDUCTION(|:found) template Index1 simd_search(Index1 first, Index1 last, Index2 s_first, Index2 s_last, BinaryPredicate p, bool b_first) noexcept { - auto n2 = s_last - s_first; + const auto n2 = s_last - s_first; if(n2 < 1) - return last; + return b_first ? 
first : last; auto n1 = last - first; if(n1 < n2) return last; - Index1 result = last; - for(auto i = n1-n2; i >= 0; --i, ++first) { - if(simd_first(s_first, s_last-s_first, first, not_pred(p)) == s_last) {//subsequence was found - result = first; - if(b_first) //first occurrence semantic - break; + if (!b_first) + first = last - n2; + + while( n1 >= n2) { + if (simd_first(s_first, n2, first, not_pred(p)).first == s_last) {//subsequence was found + return first; + } + if (b_first) { + ++first; } + else if (n1 != n2) { + --first; + } + --n1; } - return result; + return last; } template @@ -387,5 +411,43 @@ __PSTL_PRAGMA_SIMD_REDUCTION(+:init) return init; }; -} // namespace __icp_algorithm +template +Iterator simd_it_walk_1(Iterator first, DifferenceType n, Function f) noexcept { +__PSTL_PRAGMA_SIMD + for(DifferenceType i = 0; i < n; ++i) + f(first + i); + + return first + n; +} + +template +Iterator2 simd_it_walk_2(Iterator1 first1, DifferenceType n, Iterator2 first2, Function f) noexcept { +__PSTL_PRAGMA_SIMD + for (DifferenceType i = 0; i < n; ++i) + f(first1 + i, first2 + i); + return first2 + n; +} + +template +std::pair +simd_partition_copy(InputIterator first, DifferenceType n, OutputIterator1 out_true, OutputIterator2 out_false, UnaryPredicate pred) noexcept { + DifferenceType cnt_true = 0, cnt_false = 0; + +__PSTL_PRAGMA_SIMD + for (DifferenceType i = 0; i < n; ++i) { +__PSTL_PRAGMA_SIMD_ORDERED_MONOTONIC_2ARGS(cnt_true:1, cnt_false : 1) + if (pred(first[i])) { + out_true[cnt_true] = first[i]; + ++cnt_true; + } + else { + out_false[cnt_false] = first[i]; + ++cnt_false; + } + } + return std::make_pair(out_true + cnt_true, out_false + cnt_false); +} +} // namespace internal +} // namespace pstl + #endif /* __PSTL_vector_impl_H */ diff --git a/include/pstl/iterators.h b/include/pstl/iterators.h index 85ff97c0763..da5edda8126 100644 --- a/include/pstl/iterators.h +++ b/include/pstl/iterators.h @@ -204,7 +204,4 @@ zip_iterator make_zip_iterator(T... 
args) { return zip_iterator(args } //namespace pstl -namespace __icp_algorithm { -} - #endif /* __PSTL_iterators_H */ diff --git a/include/pstl/memory b/include/pstl/memory index 19d92c808a3..8edbb9f0b47 100644 --- a/include/pstl/memory +++ b/include/pstl/memory @@ -21,117 +21,219 @@ #ifndef __PSTL_memory_H #define __PSTL_memory_H -#include "_internal/pstl_config.h" -#include "_internal/memory_impl.h" +#include "internal/pstl_config.h" +#include "internal/common.h" +#include "internal/algorithm_impl.h" +#include "internal/simd_impl.h" namespace std { -// [uninitialized.copy] - template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_copy(ExecutionPolicy&& exec, InputIterator first, InputIterator last, ForwardIterator result) { - return __icp_algorithm::pattern_uninitialized_copy(first, last, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type1; + typedef typename iterator_traits::value_type value_type2; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::integral_constant::value && std::is_trivial::value>(), + [&]() { return pattern_walk2_brick(first, last, result, [is_vector](InputIterator begin, InputIterator end, ForwardIterator res) + { return brick_copy(begin, end, res, is_vector); }, is_parallel); }, + [&]() { return pattern_it_walk2(first, last, result, [](InputIterator it1, ForwardIterator it2) + { ::new (reduce_to_ptr(it2)) value_type2(*it1); }, is_vector, is_parallel); } + ); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_copy_n(ExecutionPolicy&& exec, InputIterator first, Size n, ForwardIterator result) { - return __icp_algorithm::pattern_uninitialized_copy_n(first, n, result, - 
__icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type1; + typedef typename iterator_traits::value_type value_type2; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::integral_constant::value && std::is_trivial::value>(), + [&]() { return pattern_walk2_brick_n(first, n, result, [is_vector](InputIterator begin, Size sz, ForwardIterator res) + { return brick_copy_n(begin, sz, res, is_vector); }, is_parallel); }, + [&]() { return pattern_it_walk2_n(first, n, result, [](InputIterator it1, ForwardIterator it2) + { ::new (reduce_to_ptr(it2)) value_type2(*it1); }, is_vector, is_parallel); } + ); } // [uninitialized.move] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_move(ExecutionPolicy&& exec, InputIterator first, InputIterator last, ForwardIterator result) { - return __icp_algorithm::pattern_uninitialized_move(first, last, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type1; + typedef typename iterator_traits::value_type value_type2; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::integral_constant::value && std::is_trivial::value>(), + [&]() { return pattern_walk2_brick(first, last, result, [is_vector](InputIterator begin, InputIterator end, ForwardIterator res) + { return brick_copy(begin, end, res, is_vector);}, is_parallel); }, + [&]() { return pattern_it_walk2(first, last, result, [](InputIterator it1, ForwardIterator it2) + { ::new (reduce_to_ptr(it2)) value_type2(std::move(*it1)); }, is_vector, 
is_parallel); } + ); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_move_n(ExecutionPolicy&& exec, InputIterator first, Size n, ForwardIterator result) { - return __icp_algorithm::pattern_uninitialized_move_n(first, n, result, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type1; + typedef typename iterator_traits::value_type value_type2; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::integral_constant::value && std::is_trivial::value>(), + [&]() { return pattern_walk2_brick_n(first, n, result, [is_vector](InputIterator begin, Size sz, ForwardIterator res) + { return brick_copy_n(begin, sz, res, is_vector);}, is_parallel); }, + [&]() { return pattern_it_walk2_n(first, n, result, [](InputIterator it1, ForwardIterator it2) + { ::new (reduce_to_ptr(it2)) value_type2(std::move(*it1)); }, is_vector, is_parallel); } + ); } // [uninitialized.fill] template -__icp_algorithm::enable_if_execution_policy -uninitialized_fill(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& x) { - return __icp_algorithm::pattern_uninitialized_fill(first, last, x, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); +pstl::internal::enable_if_execution_policy +uninitialized_fill(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last, const T& value) { + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + invoke_if_else(std::is_arithmetic(), + [&]() { pattern_walk_brick(first, last, [&value, &is_vector](ForwardIterator 
begin, ForwardIterator end) + { brick_fill(begin, end, value_type(value), is_vector);}, is_parallel); }, + [&]() { pattern_it_walk1(first, last, [&value](ForwardIterator it) + { ::new (reduce_to_ptr(it)) value_type(value); }, is_vector, is_parallel); } + ); } template -__icp_algorithm::enable_if_execution_policy -uninitialized_fill_n(ExecutionPolicy&& exec, ForwardIterator first, Size n, const T& x) { - return __icp_algorithm::pattern_uninitialized_fill_n(first, n, x, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); +pstl::internal::enable_if_execution_policy +uninitialized_fill_n(ExecutionPolicy&& exec, ForwardIterator first, Size n, const T& value) { + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::is_arithmetic(), + [&]() { return pattern_walk_brick_n(first, n, [&value, &is_vector](ForwardIterator begin, Size count) + { return brick_fill_n(begin, count, value_type(value), is_vector);}, is_parallel); }, + [&]() { return pattern_it_walk1_n(first, n, [&value](ForwardIterator it) + { ::new (reduce_to_ptr(it)) value_type(value); }, is_vector, is_parallel); } + ); } // [specialized.destroy] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy destroy(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - __icp_algorithm::pattern_destroy(first, last, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + invoke_if_not(std::is_trivially_destructible(), + [&]() { 
pattern_it_walk1(first, last, [](ForwardIterator it){ (*it).~value_type(); }, is_vector, is_parallel); } + ); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy destroy_n(ExecutionPolicy&& exec, ForwardIterator first, Size n) { - return __icp_algorithm::pattern_destroy_n(first, n, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::is_trivially_destructible(), + [&]() { return std::next(first, n);}, + [&]() { return pattern_it_walk1_n(first, n, [](ForwardIterator it){ (*it).~value_type(); }, is_vector, is_parallel); } + ); } // [uninitialized.construct.default] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_default_construct(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - __icp_algorithm::pattern_uninitialized_default_construct(first, last, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + invoke_if_not(std::is_trivial(), + [&]() { pattern_it_walk1(first, last, [](ForwardIterator it) { ::new (reduce_to_ptr(it)) value_type; }, is_vector, is_parallel); }); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_default_construct_n(ExecutionPolicy&& exec, ForwardIterator first, Size n) { - return __icp_algorithm::pattern_uninitialized_default_construct_n(first, n, - 
__icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::is_trivial(), + [&]() { return std::next(first, n);}, + [&]() { return pattern_it_walk1_n(first, n, [](ForwardIterator it) + { ::new (reduce_to_ptr(it)) value_type; }, is_vector, is_parallel); } + ); } // [uninitialized.construct.value] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_value_construct(ExecutionPolicy&& exec, ForwardIterator first, ForwardIterator last) { - __icp_algorithm::pattern_uninitialized_value_construct(first, last, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + invoke_if_else(std::is_trivial(), + [&]() { pattern_walk_brick(first, last, [is_vector](ForwardIterator begin, ForwardIterator end) + { brick_fill(begin, end, value_type(), is_vector);}, is_parallel); }, + [&]() { pattern_it_walk1(first, last, [](ForwardIterator it) + { ::new (reduce_to_ptr(it)) value_type(); }, is_vector, is_parallel); } + ); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy uninitialized_value_construct_n(ExecutionPolicy&& exec, ForwardIterator first, Size n) { - return __icp_algorithm::pattern_uninitialized_value_construct_n(first, n, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + typedef typename iterator_traits::value_type value_type; + using namespace 
pstl::internal; + + const auto is_parallel = is_parallelization_preferred(exec); + const auto is_vector = is_vectorization_preferred(exec); + + return invoke_if_else(std::is_trivial(), + [&]() { return pattern_walk_brick_n(first, n, [is_vector](ForwardIterator begin, Size count) + { return brick_fill_n(begin, count, value_type(), is_vector);}, is_parallel); }, + [&]() { return pattern_it_walk1_n(first, n, [](ForwardIterator it) + { ::new (reduce_to_ptr(it)) value_type(); }, is_vector, is_parallel); } + ); } } // namespace std diff --git a/include/pstl/numeric b/include/pstl/numeric index c1b190e1bff..11f6ea9fc55 100644 --- a/include/pstl/numeric +++ b/include/pstl/numeric @@ -23,141 +23,128 @@ #include -#include "_internal/pstl_config.h" -#include "_internal/common.h" -#include "_internal/numeric_impl.h" -#if __PSTL_USE_TBB -#include "_internal/parallel_impl_tbb.h" -#else - __PSTL_PRAGMA_MESSAGE("Backend was not specified"); -#endif +#include "internal/pstl_config.h" +#include "internal/common.h" +#include "internal/numeric_impl.h" namespace std { // [reduce] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy reduce(ExecutionPolicy&& exec, InputIterator first, InputIterator last, T init, BinaryOperation binary_op) { - return transform_reduce(exec, first, last, init, binary_op, __icp_algorithm::no_op()); + return transform_reduce(exec, first, last, init, binary_op, pstl::internal::no_op()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy reduce(ExecutionPolicy&& exec, InputIterator first, InputIterator last, T init) { - return transform_reduce(exec, first, last, init, std::plus(), __icp_algorithm::no_op()); + return transform_reduce(exec, first, last, init, std::plus(), pstl::internal::no_op()); } template -__icp_algorithm::enable_if_execution_policy::value_type> +pstl::internal::enable_if_execution_policy::value_type> reduce(ExecutionPolicy&& exec, InputIterator 
first, InputIterator last) { typedef typename decay::value_type>::type T; - return transform_reduce(exec, first, last, T{}, std::plus(), __icp_algorithm::no_op()); + return transform_reduce(exec, first, last, T{}, std::plus(), pstl::internal::no_op()); } // [transform.reduce] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_reduce(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, T init) { typedef typename iterator_traits::value_type input_type; - return __icp_algorithm::pattern_transform_reduce(first1, last1, first2, init, std::plus(), std::multiplies(), - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_transform_reduce(first1, last1, first2, init, std::plus(), std::multiplies(), + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_reduce(ExecutionPolicy&& exec, InputIterator1 first1, InputIterator1 last1, InputIterator2 first2, T init, BinaryOperation1 binary_op1, BinaryOperation2 binary_op2) { - return __icp_algorithm::pattern_transform_reduce(first1, last1, first2, init, binary_op1, binary_op2, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_transform_reduce(first1, last1, first2, init, binary_op1, binary_op2, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_reduce(ExecutionPolicy&& exec, InputIterator first, InputIterator last, T init, BinaryOperation binary_op, UnaryOperation unary_op) { - return __icp_algorithm::pattern_transform_reduce(first, last, init, binary_op, unary_op, - 
__icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_transform_reduce(first, last, init, binary_op, unary_op, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [exclusive.scan] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy exclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, T init) { - return transform_exclusive_scan(exec, first, last, result, init, std::plus(), __icp_algorithm::no_op()); + return transform_exclusive_scan(exec, first, last, result, init, std::plus(), pstl::internal::no_op()); } template OutputIterator exclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryOperation binary_op) { - return transform_exclusive_scan(exec, first, last, result, init, binary_op, __icp_algorithm::no_op()); + return transform_exclusive_scan(exec, first, last, result, init, binary_op, pstl::internal::no_op()); } // [inclusive.scan] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result) { typedef typename iterator_traits::value_type input_type; - - if( first!=last ) { - auto tmp = *first; - *result = tmp; - return transform_inclusive_scan(exec, ++first, last, ++result, std::plus(), __icp_algorithm::no_op(), tmp); - } else { - return result; - } + return transform_inclusive_scan(exec, first, last, result, std::plus(), pstl::internal::no_op()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, BinaryOperation binary_op) { - if( first!=last ) { - auto tmp = *first; - *result = tmp; - return 
transform_inclusive_scan(exec, ++first, last, ++result, binary_op, __icp_algorithm::no_op(), tmp); - } else { - return result; - } + return transform_inclusive_scan(exec, first, last, result, binary_op, pstl::internal::no_op()); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, BinaryOperation binary_op, T init) { - return transform_inclusive_scan(exec, first, last, result, binary_op, __icp_algorithm::no_op(), init); + return transform_inclusive_scan(exec, first, last, result, binary_op, pstl::internal::no_op(), init); } // [transform.exclusive.scan] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_exclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, T init, BinaryOperation binary_op, UnaryOperation unary_op) { - return __icp_algorithm::pattern_transform_scan( + using namespace pstl::internal; + return pattern_transform_scan( first, last, result, unary_op, init, binary_op, /*inclusive=*/std::false_type(), - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } // [transform.inclusive.scan] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, BinaryOperation binary_op, UnaryOperation unary_op, T init) { - return __icp_algorithm::pattern_transform_scan( + using namespace pstl::internal; + return pattern_transform_scan( first, last, result, unary_op, init, binary_op, /*inclusive=*/std::true_type(), - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + 
is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy transform_inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator result, BinaryOperation binary_op, UnaryOperation unary_op) { if( first!=last ) { - auto tmp = *first; + auto tmp = unary_op(*first); *result = tmp; return transform_inclusive_scan(exec, ++first, last, ++result, binary_op, unary_op, tmp); } else { @@ -168,15 +155,16 @@ transform_inclusive_scan(ExecutionPolicy&& exec, InputIterator first, InputIter // [adjacent.difference] template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy adjacent_difference(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator d_first, BinaryOperation op) { - return __icp_algorithm::pattern_adjacent_difference(first, last, d_first, op, - __icp_algorithm::is_vectorization_preferred(exec), - __icp_algorithm::is_parallelization_preferred(exec)); + using namespace pstl::internal; + return pattern_adjacent_difference(first, last, d_first, op, + is_vectorization_preferred(exec), + is_parallelization_preferred(exec)); } template -__icp_algorithm::enable_if_execution_policy +pstl::internal::enable_if_execution_policy adjacent_difference(ExecutionPolicy&& exec, InputIterator first, InputIterator last, OutputIterator d_first) { typedef typename iterator_traits::value_type value_type; return adjacent_difference(exec, first, last, d_first, std::minus());