From fceccc3ff76678b1e6a99381f49f18656436010c Mon Sep 17 00:00:00 2001 From: Raghuveer Devulapalli Date: Fri, 13 Oct 2023 11:10:31 -0700 Subject: [PATCH] Add comments --- src/xss-network-qsort.hpp | 25 +++++++++++++++++++++++-- src/xss-optimal-networks.hpp | 2 +- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/xss-network-qsort.hpp b/src/xss-network-qsort.hpp index 178ed38c..a768a580 100644 --- a/src/xss-network-qsort.hpp +++ b/src/xss-network-qsort.hpp @@ -30,6 +30,25 @@ X86_SIMD_SORT_FINLINE void bitonic_sort_n_vec(reg_t *regs) } } +/* + * Swizzle ops explained: + * swap_n<N>: swap neighbouring blocks of size N/2 within block of size N + * reg i = [7,6,5,4,3,2,1,0] + * swap_n<2>: = [[6,7],[4,5],[2,3],[0,1]] + * swap_n<4>: = [[5,4,7,6],[1,0,3,2]] + * swap_n<8>: = [[3,2,1,0,7,6,5,4]] + * reverse_n<N>: reverse elements within block of size N + * reg i = [7,6,5,4,3,2,1,0] + * rev_n<2>: = [[6,7],[4,5],[2,3],[0,1]] + * rev_n<4>: = [[4,5,6,7],[0,1,2,3]] + * rev_n<8>: = [[0,1,2,3,4,5,6,7]] + * merge_n<N>: merge blocks of N/2 elements from two regs + * reg b,a = [a,a,a,a,a,a,a,a], [b,b,b,b,b,b,b,b] + * merge_n<2> = [a,b,a,b,a,b,a,b] + * merge_n<4> = [a,a,b,b,a,a,b,b] + * merge_n<8> = [a,a,a,a,b,b,b,b] + */ + template X86_SIMD_SORT_FINLINE void internal_merge_n_vec(typename vtype::reg_t *reg) { @@ -155,10 +174,12 @@ X86_SIMD_SORT_INLINE void sort_n_vec(typename vtype::type_t *arr, int N) vtype::zmm_max(), ioMasks[j], arr + i * vtype::numlanes); } - // Run the initial sorting network + /* Run the initial sorting network to sort the columns of the [numVecs x + * num_lanes] matrix + */ bitonic_sort_n_vec(vecs); - // Merge vectors together + // Merge the vectors using bitonic merging networks merge_n_vec(vecs); // Unmasked part of the store diff --git a/src/xss-optimal-networks.hpp b/src/xss-optimal-networks.hpp index a136b4f3..3dfa5281 100644 --- a/src/xss-optimal-networks.hpp +++ b/src/xss-optimal-networks.hpp @@ -317,4 +317,4 @@ X86_SIMD_SORT_FINLINE void 
optimal_sort_32(reg_t *vecs) COEX(vecs[23], vecs[24]); COEX(vecs[25], vecs[26]); COEX(vecs[27], vecs[28]); -} \ No newline at end of file +}