diff --git a/src/avx2-32bit-half.hpp b/src/avx2-32bit-half.hpp
index 9100cbb..9e782bb 100644
--- a/src/avx2-32bit-half.hpp
+++ b/src/avx2-32bit-half.hpp
@@ -64,6 +64,11 @@ struct avx2_half_vector<int32_t> {
     {
         return _mm_set1_epi32(type_max());
     } // TODO: this should broadcast bits as is?
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1);
+        return _mm_xor_si128(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -186,6 +191,10 @@ struct avx2_half_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k)
+    {
+        return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -218,6 +227,11 @@ struct avx2_half_vector<uint32_t> {
     {
         return _mm_set1_epi32(type_max());
     }
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1);
+        return _mm_xor_si128(x, allOnes);
+    }
     static opmask_t get_partial_loadmask(uint64_t num_to_read)
     {
         auto mask = ((0x1ull << num_to_read) - 0x1ull);
@@ -331,6 +345,10 @@ struct avx2_half_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k)
+    {
+        return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
@@ -363,7 +381,11 @@ struct avx2_half_vector<float> {
     {
         return _mm_set1_ps(type_max());
     }
-
+    static opmask_t knot_opmask(opmask_t x)
+    {
+        auto allOnes = seti(-1, -1, -1, -1);
+        return _mm_xor_si128(x, allOnes);
+    }
     static regi_t seti(int v1, int v2, int v3, int v4)
     {
         return _mm_set_epi32(v1, v2, v3, v4);
@@ -492,6 +514,10 @@ struct avx2_half_vector<float> {
     {
         return _mm_castps_si128(v);
     }
+    static bool all_false(opmask_t k)
+    {
+        return _mm_movemask_ps(_mm_castsi128_ps(k)) == 0;
+    }
     static int double_compressstore(type_t *left_addr,
                                     type_t *right_addr,
                                     opmask_t k,
diff --git a/src/avx512-64bit-common.h b/src/avx512-64bit-common.h
index 689c317..14201d1 100644
--- a/src/avx512-64bit-common.h
+++ b/src/avx512-64bit-common.h
@@ -210,6 +210,10 @@ struct ymm_vector<float> {
     {
         return _mm256_castps_si256(v);
    }
+    static bool all_false(opmask_t k)
+    {
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -394,6 +398,10 @@ struct ymm_vector<uint32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k)
+    {
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
@@ -578,6 +586,10 @@ struct ymm_vector<int32_t> {
     {
         return v;
     }
+    static bool all_false(opmask_t k)
+    {
+        return k == 0;
+    }
     static reg_t reverse(reg_t ymm)
     {
         const __m256i rev_index = _mm256_set_epi32(NETWORK_32BIT_AVX2_2);
diff --git a/src/xss-common-keyvaluesort.hpp b/src/xss-common-keyvaluesort.hpp
index 88552ce..79b2af7 100644
--- a/src/xss-common-keyvaluesort.hpp
+++ b/src/xss-common-keyvaluesort.hpp
@@ -72,7 +72,7 @@ X86_SIMD_SORT_INLINE arrsize_t kvpartition(type_t1 *keys,
     for (int32_t i = (right - left) % vtype1::numlanes; i > 0; --i) {
         *smallest = std::min(*smallest, keys[left]);
         *biggest = std::max(*biggest, keys[left]);
-        if (keys[left] > pivot) {
+        if (keys[left] >= pivot) {
             right--;
             std::swap(keys[left], keys[right]);
             std::swap(indexes[left], indexes[right]);
@@ -204,12 +204,13 @@ X86_SIMD_SORT_INLINE arrsize_t kvpartition_unrolled(type_t1 *keys,
         return kvpartition<vtype1, vtype2>(
                 keys, indexes, left, right, pivot, smallest, biggest);
     }
+    /* make the array length divisible by vtype1::numlanes, shortening the array */
     for (int32_t i = ((right - left) % (num_unroll * vtype1::numlanes));
          i > 0;
          --i) {
         *smallest = std::min(*smallest, keys[left]);
         *biggest = std::max(*biggest, keys[left]);
-        if (keys[left] > pivot) {
+        if (keys[left] >= pivot) {
             right--;
             std::swap(keys[left], keys[right]);
             std::swap(indexes[left], indexes[right]);
@@ -386,18 +387,27 @@ X86_SIMD_SORT_INLINE void kvsort_(type1_t *keys,
      * Base case: use bitonic networks to sort arrays <= 128
      */
     if (right + 1 - left <= 128) {
-
         kvsort_n<vtype1, vtype2, 128>(
                 keys + left, indexes + left, (int32_t)(right + 1 - left));
         return;
     }
 
-    type1_t pivot = get_pivot_blocks<vtype1, type1_t>(keys, left, right);
+    // Ascending comparator for this vtype
+    using comparator = Comparator<vtype1, false>;
+    type1_t pivot;
+    auto pivot_result
+            = get_pivot_smart<vtype1, comparator, type1_t>(keys, left, right);
+    pivot = pivot_result.pivot;
+
+    if (pivot_result.result == pivot_result_t::Sorted) { return; }
+
     type1_t smallest = vtype1::type_max();
     type1_t biggest = vtype1::type_min();
 
     arrsize_t pivot_index = kvpartition_unrolled<vtype1, vtype2, 4>(
             keys, indexes, left, right + 1, pivot, &smallest, &biggest);
+
+    if (pivot_result.result == pivot_result_t::Only2Values) { return; }
+
 #ifdef XSS_COMPILE_OPENMP
     if (pivot != smallest) {
         bool parallel_left = (pivot_index - left) > task_threshold;
diff --git a/src/xss-pivot-selection.hpp b/src/xss-pivot-selection.hpp
index 6ce0b88..c09dfc6 100644
--- a/src/xss-pivot-selection.hpp
+++ b/src/xss-pivot-selection.hpp
@@ -148,12 +148,7 @@ get_pivot_smart(type_t *arr, const arrsize_t left, const arrsize_t right)
         return pivot_results<type_t>(
                 comparator::choosePivotMedianIsLargest(median));
     }
-    else {
-        // Should be unreachable
-        return pivot_results<type_t>(median);
-    }
-
-    // Should be unreachable
     return pivot_results<type_t>(median);
 }
diff --git a/utils/rand_array.h b/utils/rand_array.h
index cb99da2..dccbacd 100644
--- a/utils/rand_array.h
+++ b/utils/rand_array.h
@@ -136,8 +136,8 @@ static std::vector<T> get_array(std::string arrtype,
         else {
             val = std::numeric_limits<T>::max();
         }
-        for (size_t ii = 1; ii <= arrsize; ++ii) {
-            if (rand() % 0x1) { arr[ii] = val; }
+        for (size_t ii = 0; ii < arrsize; ++ii) {
+            if (rand() & 0x1) { arr[ii] = val; }
         }
     }
     else {