Skip to content

Commit

Permalink
Merge pull request #83 from sterrettm2/perfwork
Browse files Browse the repository at this point in the history
Various performance improvements
  • Loading branch information
r-devulap authored Oct 13, 2023
2 parents a0eef89 + 68e5393 commit 0fad81f
Show file tree
Hide file tree
Showing 16 changed files with 1,266 additions and 395 deletions.
31 changes: 31 additions & 0 deletions .github/workflows/c-cpp.yml
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,34 @@ jobs:
- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe

SPR-gcc13-min-networksort:

runs-on: intel-ubuntu-latest

steps:
- uses: actions/checkout@v3

- name: Install dependencies
run: |
sudo apt update
sudo apt -y install g++-13 libgtest-dev meson curl git cmake
- name: Install Intel SDE
run: |
curl -o /tmp/sde.tar.xz https://downloadmirror.intel.com/784319/sde-external-9.24.0-2023-07-13-lin.tar.xz
mkdir /tmp/sde && tar -xvf /tmp/sde.tar.xz -C /tmp/sde/
sudo mv /tmp/sde/* /opt/sde && sudo ln -s /opt/sde/sde64 /usr/bin/sde
- name: Build
env:
CXX: g++-13
CXXFLAGS: -DXSS_MINIMAL_NETWORK_SORT
run: |
make clean
meson setup --warnlevel 2 --werror --buildtype release builddir
cd builddir
ninja
- name: Run test suite on SPR
run: sde -spr -- ./builddir/testexe
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# When unset, discover g++. Prioritise the latest version on the path.
ifeq (, $(and $(strip $(CXX)), $(filter-out default undefined, $(origin CXX))))
override CXX := $(shell which g++-12 g++-11 g++-10 g++-9 g++-8 g++ 2>/dev/null | head -n 1)
override CXX := $(shell which g++-13 g++-12 g++-11 g++-10 g++-9 g++-8 g++ 2>/dev/null | head -n 1)
ifeq (, $(strip $(CXX)))
$(error Could not locate the g++ compiler. Please manually specify its path using the CXX variable)
endif
Expand Down
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -170,3 +170,4 @@ Skylake https://arxiv.org/pdf/1704.08579.pdf

* [4] http://mitp-content-server.mit.edu:18180/books/content/sectbyfn?collid=books_pres_0&fn=Chapter%2027.pdf&id=8030

* [5] https://bertdobbelaere.github.io/sorting_networks.html
242 changes: 216 additions & 26 deletions src/avx512-16bit-common.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
#define AVX512_16BIT_COMMON

#include "avx512-common-qsort.h"
#include "xss-network-qsort.hpp"

/*
* Constants used in sorting 32 elements in a ZMM registers. Based on Bitonic
Expand Down Expand Up @@ -93,30 +92,221 @@ X86_SIMD_SORT_INLINE reg_t sort_zmm_16bit(reg_t zmm)
return zmm;
}

// Assumes zmm is bitonic and performs a recursive half cleaner
template <typename vtype, typename reg_t = typename vtype::reg_t>
X86_SIMD_SORT_INLINE reg_t bitonic_merge_zmm_16bit(reg_t zmm)
{
// 1) half_cleaner[32]: compare 1-17, 2-18, 3-19 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(6), zmm), 0xFFFF0000);
// 2) half_cleaner[16]: compare 1-9, 2-10, 3-11 etc ..
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(5), zmm), 0xFF00FF00);
// 3) half_cleaner[8]
zmm = cmp_merge<vtype>(
zmm, vtype::permutexvar(vtype::get_network(3), zmm), 0xF0F0F0F0);
// 3) half_cleaner[4]
zmm = cmp_merge<vtype>(
zmm,
vtype::template shuffle<SHUFFLE_MASK(1, 0, 3, 2)>(zmm),
0xCCCCCCCC);
// 3) half_cleaner[2]
zmm = cmp_merge<vtype>(
zmm,
vtype::template shuffle<SHUFFLE_MASK(2, 3, 0, 1)>(zmm),
0xAAAAAAAA);
return zmm;
}
struct avx512_16bit_swizzle_ops {
template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t swap_n(typename vtype::reg_t reg)
{
__m512i v = vtype::cast_to(reg);

if constexpr (scale == 2) {
__m512i mask = _mm512_set_epi16(30,
31,
28,
29,
26,
27,
24,
25,
22,
23,
20,
21,
18,
19,
16,
17,
14,
15,
12,
13,
10,
11,
8,
9,
6,
7,
4,
5,
2,
3,
0,
1);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 4) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b10110001);
}
else if constexpr (scale == 8) {
v = _mm512_shuffle_epi32(v, (_MM_PERM_ENUM)0b01001110);
}
else if constexpr (scale == 16) {
v = _mm512_shuffle_i64x2(v, v, 0b10110001);
}
else if constexpr (scale == 32) {
v = _mm512_shuffle_i64x2(v, v, 0b01001110);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v);
}

template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t
reverse_n(typename vtype::reg_t reg)
{
__m512i v = vtype::cast_to(reg);

if constexpr (scale == 2) { return swap_n<vtype, 2>(reg); }
else if constexpr (scale == 4) {
__m512i mask = _mm512_set_epi16(28,
29,
30,
31,
24,
25,
26,
27,
20,
21,
22,
23,
16,
17,
18,
19,
12,
13,
14,
15,
8,
9,
10,
11,
4,
5,
6,
7,
0,
1,
2,
3);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 8) {
__m512i mask = _mm512_set_epi16(24,
25,
26,
27,
28,
29,
30,
31,
16,
17,
18,
19,
20,
21,
22,
23,
8,
9,
10,
11,
12,
13,
14,
15,
0,
1,
2,
3,
4,
5,
6,
7);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 16) {
__m512i mask = _mm512_set_epi16(16,
17,
18,
19,
20,
21,
22,
23,
24,
25,
26,
27,
28,
29,
30,
31,
0,
1,
2,
3,
4,
5,
6,
7,
8,
9,
10,
11,
12,
13,
14,
15);
v = _mm512_permutexvar_epi16(mask, v);
}
else if constexpr (scale == 32) {
return vtype::reverse(reg);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v);
}

template <typename vtype, int scale>
X86_SIMD_SORT_INLINE typename vtype::reg_t
merge_n(typename vtype::reg_t reg, typename vtype::reg_t other)
{
__m512i v1 = vtype::cast_to(reg);
__m512i v2 = vtype::cast_to(other);

if constexpr (scale == 2) {
v1 = _mm512_mask_blend_epi16(
0b01010101010101010101010101010101, v1, v2);
}
else if constexpr (scale == 4) {
v1 = _mm512_mask_blend_epi16(
0b00110011001100110011001100110011, v1, v2);
}
else if constexpr (scale == 8) {
v1 = _mm512_mask_blend_epi16(
0b00001111000011110000111100001111, v1, v2);
}
else if constexpr (scale == 16) {
v1 = _mm512_mask_blend_epi16(
0b00000000111111110000000011111111, v1, v2);
}
else if constexpr (scale == 32) {
v1 = _mm512_mask_blend_epi16(
0b00000000000000001111111111111111, v1, v2);
}
else {
static_assert(scale == -1, "should not be reached");
}

return vtype::cast_from(v1);
}
};

#endif // AVX512_16BIT_COMMON
Loading

0 comments on commit 0fad81f

Please sign in to comment.