Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Various performance improvements #83

Merged
merged 24 commits into from
Oct 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
96719ff
Changed the method used for the small sort
sterrettm2 Sep 22, 2023
2a037ac
Changed how partition code shortens the array before the main loop
sterrettm2 Sep 22, 2023
9915834
Changed core partition logic
sterrettm2 Sep 22, 2023
7675e11
Larger pivot code
sterrettm2 Sep 25, 2023
818adb4
Changed some parameters
sterrettm2 Sep 25, 2023
d6b37e6
Increased the amount of prefetch done (and small parameter changes)
sterrettm2 Sep 29, 2023
40672c2
Made sure min/max values are updated correctly; other small fixes
sterrettm2 Oct 2, 2023
2b563a2
clang-format
sterrettm2 Oct 4, 2023
c3f855f
Added reference to README
sterrettm2 Oct 4, 2023
6c0704b
Changed types in many places, removed unused bitonic sort logic
sterrettm2 Oct 5, 2023
77fb71e
Move pivot selection to its own file
r-devulap Oct 6, 2023
4def001
Include xss-network-sort in common ile
r-devulap Oct 6, 2023
44407b6
Minor changes to partition code
sterrettm2 Oct 5, 2023
3a4a1e9
Changed unroll aligner to use partition_vec
sterrettm2 Oct 10, 2023
6dcd295
Changed unroll alignment code to keep values in registers and partiti…
sterrettm2 Oct 10, 2023
3dd2d13
Changed how partitioning small arrays is handled
sterrettm2 Oct 10, 2023
8549cc3
Add new CI to test parition_avx512 code
r-devulap Oct 11, 2023
f60a119
Fixed minor bug in partition code
sterrettm2 Oct 12, 2023
506fe06
Fix formatting and add a few comments
r-devulap Oct 12, 2023
ed651eb
Fix more formatting
r-devulap Oct 12, 2023
01f0f02
Use num_unroll = 9 for 16-bit data
r-devulap Oct 12, 2023
cbd6179
Do not use a set seed in tests
r-devulap Oct 12, 2023
fceccc3
Add comments
r-devulap Oct 13, 2023
68e5393
Bug fix in tests
r-devulap Oct 13, 2023
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 31 additions & 0 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,34 @@ jobs:

- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe

SPR-gcc13-min-networksort:

runs-on: intel-ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Install dependencies
run: |
sudo apt update
sudo apt -y install g++-13 libgtest-dev meson curl git cmake

- name: Install Intel SDE
run: |
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde

- name: Build
env:
CXX: g++-13
CXXFLAGS: -DXSS_MINIMAL_NETWORK_SORT
run: |
make clean
meson setup --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja

- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# When unset, discover g++. Prioritise the latest version on the path.
ifeq (, $(and $(strip $(CXX)), $(filter-out default undefined, $(origin CXX))))
override CXX := $(shell which g++-12 g++-11 g++-10 g++-9 g++-8 g++ 2>/dev/null | head -n 1)
override CXX := $(shell which g++-13 g++-12 g++-11 g++-10 g++-9 g++-8 g++ 2>/dev/null | head -n 1)
ifeq (, $(strip $(CXX)))
$(error Could not locate the g++ compiler. Please manually specify its path using the CXX variable)
endif
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,4 @@ Skylake https://arxiv.org/pdf/1704.08579.pdf

* [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030

* [5] https://bertdobbelaere.github.io/sorting_networks.html
242 changes: 216 additions & 26 deletions src/avx512-16bit-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#define AVX512_16BIT_COMMON

#include "avx512-common-qsort.h"
#include "xss-network-qsort.hpp"

/*
* Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic
Expand Down Expand Up @@ -93,30 +92,221 @@ X86_SIMD_SORT_INLINE reg_t sort_zmm_16bit(reg_t zmm)
return zmm;
}

// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_16bit(reg_t zmm)
{
// 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000);
// 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00);
// 3) half_cleaner[8]
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0);
// 3) half_cleaner[4]
zmm = cmp_merge<vtype>(
zmm,
vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
0xCCCCCCCC);
// 3) half_cleaner[2]
zmm = cmp_merge<vtype>(
zmm,
vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
0xAAAAAAAA);
return zmm;
}
struct avx512_16bit_swizzle_ops {
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
{
__m512i v = vtype::cast_to(reg);

if constexpr (scale == 2) {
__m512i mask = _mm512_set_epi16(30,
31,
28,
29,
26,
27,
24,
25,
22,
23,
20,
21,
18,
19,
16,
17,
14,
15,
12,
13,
10,
11,
8,
9,
6,
7,
4,
5,
2,
3,
0,
1);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 4) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001);
}
else if constexpr (scale == 8) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110);
}
else if constexpr (scale == 16) {
v = _mm512_shuffle_i64x2(v, v, 0b10110001);
}
else if constexpr (scale == 32) {
v = _mm512_shuffle_i64x2(v, v, 0b01001110);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v);
}

template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t
reverse_n(typename vtype::reg_t reg)
{
__m512i v = vtype::cast_to(reg);

if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
else if constexpr (scale == 4) {
__m512i mask = _mm512_set_epi16(28,
29,
30,
31,
24,
25,
26,
27,
20,
21,
22,
23,
16,
17,
18,
19,
12,
13,
14,
15,
8,
9,
10,
11,
4,
5,
6,
7,
0,
1,
2,
3);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 8) {
__m512i mask = _mm512_set_epi16(24,
25,
26,
27,
28,
29,
30,
31,
16,
17,
18,
19,
20,
21,
22,
23,
8,
9,
10,
11,
12,
13,
14,
15,
0,
1,
2,
3,
4,
5,
6,
7);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 16) {
__m512i mask = _mm512_set_epi16(16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 32) {
return vtype::reverse(reg);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v);
}

template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t
merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
{
__m512i v1 = vtype::cast_to(reg);
__m512i v2 = vtype::cast_to(other);

if constexpr (scale == 2) {
v1 = _mm512_mask_blend_epi16(
0b01010101010101010101010101010101, v1, v2);
}
else if constexpr (scale == 4) {
v1 = _mm512_mask_blend_epi16(
0b00110011001100110011001100110011, v1, v2);
}
else if constexpr (scale == 8) {
v1 = _mm512_mask_blend_epi16(
0b00001111000011110000111100001111, v1, v2);
}
else if constexpr (scale == 16) {
v1 = _mm512_mask_blend_epi16(
0b00000000111111110000000011111111, v1, v2);
}
else if constexpr (scale == 32) {
v1 = _mm512_mask_blend_epi16(
0b00000000000000001111111111111111, v1, v2);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v1);
}
};

#endif // AVX512_16BIT_COMMON
Loading