Skip to content

Commit

Permalink
Merge pull request #68 from r-devulap/gcc-specific
Browse files Browse the repository at this point in the history
Use global Macros for GCC specific keywords
  • Loading branch information
r-devulap authored Sep 5, 2023
2 parents f97b484 + dd79993 commit 94280b1
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 27 deletions.
26 changes: 13 additions & 13 deletions src/avx512-64bit-argsort.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[4];
argzmm_t argzmm[4];

#pragma GCC unroll 2
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -117,7 +117,7 @@ X86_SIMD_SORT_INLINE void argsort_32_64bit(type_t *arr, int64_t *arg, int32_t N)

uint64_t combined_mask = (0x1ull << (N - 16)) - 0x1ull;
opmask_t load_mask[2] = {0xFF, 0xFF};
#pragma GCC unroll 2
#pragma X86_SIMD_SORT_UNROLL_LOOP(2)
for (int ii = 0; ii < 2; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 2] = argtype::maskz_loadu(load_mask[ii], arg + 16 + 8 * ii);
Expand Down Expand Up @@ -151,7 +151,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
zmm_t arrzmm[8];
argzmm_t argzmm[8];

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argzmm[ii] = argtype::loadu(arg + 8 * ii);
arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -160,7 +160,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)

opmask_t load_mask[4] = {0xFF, 0xFF, 0xFF, 0xFF};
uint64_t combined_mask = (0x1ull << (N - 32)) - 0x1ull;
#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
load_mask[ii] = (combined_mask >> (ii * 8)) & 0xFF;
argzmm[ii + 4] = argtype::maskz_loadu(load_mask[ii], arg + 32 + 8 * ii);
Expand All @@ -170,7 +170,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
argzmm[ii + 4]);
}

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 8; ii = ii + 2) {
bitonic_merge_two_zmm_64bit<vtype, argtype>(
arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
Expand All @@ -179,11 +179,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
bitonic_merge_four_zmm_64bit<vtype, argtype>(arrzmm + 4, argzmm + 4);
bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm, argzmm);

#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::storeu(arg + 8 * ii, argzmm[ii]);
}
#pragma GCC unroll 4
#pragma X86_SIMD_SORT_UNROLL_LOOP(4)
for (int ii = 0; ii < 4; ++ii) {
argtype::mask_storeu(arg + 32 + 8 * ii, load_mask[ii], argzmm[ii + 4]);
}
Expand All @@ -203,7 +203,7 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// zmm_t arrzmm[16];
// argzmm_t argzmm[16];
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii] = argtype::loadu(arg + 8*ii);
// arrzmm[ii] = vtype::template i64gather<sizeof(type_t)>(argzmm[ii], arr);
Expand All @@ -213,19 +213,19 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// opmask_t load_mask[8] = {0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
// if (N != 128) {
// uint64_t combined_mask = (0x1ull << (N - 64)) - 0x1ull;
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// load_mask[ii] = (combined_mask >> (ii*8)) & 0xFF;
// }
// }
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argzmm[ii+8] = argtype::maskz_loadu(load_mask[ii], arg + 64 + 8*ii);
// arrzmm[ii+8] = vtype::template mask_i64gather<sizeof(type_t)>(vtype::zmm_max(), load_mask[ii], argzmm[ii+8], arr);
// arrzmm[ii+8] = sort_zmm_64bit<vtype, argtype>(arrzmm[ii+8], argzmm[ii+8]);
// }
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 16; ii = ii + 2) {
// bitonic_merge_two_zmm_64bit<vtype, argtype>(arrzmm[ii], arrzmm[ii + 1], argzmm[ii], argzmm[ii + 1]);
// }
Expand All @@ -237,11 +237,11 @@ X86_SIMD_SORT_INLINE void argsort_64_64bit(type_t *arr, int64_t *arg, int32_t N)
// bitonic_merge_eight_zmm_64bit<vtype, argtype>(arrzmm+8, argzmm+8);
// bitonic_merge_sixteen_zmm_64bit<vtype, argtype>(arrzmm, argzmm);
//
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::storeu(arg + 8*ii, argzmm[ii]);
// }
//#pragma GCC unroll 8
//#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
// for (int ii = 0; ii < 8; ++ii) {
// argtype::mask_storeu(arg + 64 + 8*ii, load_mask[ii], argzmm[ii + 8]);
// }
Expand Down
12 changes: 6 additions & 6 deletions src/avx512-common-argsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// first and last vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
argzmm_t argvec_left[num_unroll], argvec_right[num_unroll];
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
argvec_left[ii] = argtype::loadu(arg + left + vtype::numlanes * ii);
vec_left[ii] = vtype::template i64gather<sizeof(type_t)>(
Expand All @@ -224,7 +224,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii]
= argtype::loadu(arg + right + ii * vtype::numlanes);
Expand All @@ -233,7 +233,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}
}
else {
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
arg_vec[ii] = argtype::loadu(arg + left + ii * vtype::numlanes);
curr_vec[ii] = vtype::template i64gather<sizeof(type_t)>(
Expand All @@ -242,7 +242,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand All @@ -259,7 +259,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}

/* partition and save vec_left and vec_right */
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand All @@ -273,7 +273,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
l_store += (vtype::numlanes - amount_gt_pivot);
r_store -= amount_gt_pivot;
}
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_gt_pivot
= partition_vec<vtype>(arg,
Expand Down
27 changes: 19 additions & 8 deletions src/avx512-common-qsort.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,9 +67,12 @@
#define ZMM_MAX_INT16 _mm512_set1_epi16(X86_SIMD_SORT_MAX_INT16)
#define SHUFFLE_MASK(a, b, c, d) (a << 6) | (b << 4) | (c << 2) | d

/* Compiler specific macros */
#ifdef _MSC_VER
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __forceinline
#define LIKELY(x)
#define UNLIKELY(x)
#elif defined(__CYGWIN__)
/*
* Force inline in cygwin to work around a compiler bug. See
Expand All @@ -80,13 +83,21 @@
#elif defined(__GNUC__)
#define X86_SIMD_SORT_INLINE static inline
#define X86_SIMD_SORT_FINLINE static __attribute__((always_inline))
#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
#else
#define X86_SIMD_SORT_INLINE static
#define X86_SIMD_SORT_FINLINE static
#define LIKELY(x)
#define UNLIKELY(x)
#endif

#define LIKELY(x) __builtin_expect((x), 1)
#define UNLIKELY(x) __builtin_expect((x), 0)
/*
 * X86_SIMD_SORT_UNROLL_LOOP(num): portable loop-unrolling hint.
 *
 * Place the macro by itself on the line directly before a `for` loop:
 *
 *     X86_SIMD_SORT_UNROLL_LOOP(8)
 *     for (int ii = 0; ii < 8; ++ii) { ... }
 *
 * When __GNUC__ >= 8 it produces the directive `#pragma GCC unroll num`;
 * otherwise it expands to nothing, so the loop compiles unchanged.
 *
 * The directive is generated through the _Pragma operator. Whether macros
 * are replaced inside a `#pragma` line is implementation-defined, and GCC
 * does not expand an unrecognised pragma: writing
 * `#pragma X86_SIMD_SORT_UNROLL_LOOP(8)` is ignored as an unknown pragma and
 * gives no unrolling. Invoke the macro directly, without a `#pragma` prefix.
 */
/* Helper: stringify the already-substituted tokens and hand them to _Pragma. */
#define X86_SIMD_SORT_PRAGMA(x) _Pragma(#x)
/* defined() guard avoids evaluating an undefined __GNUC__ on other compilers. */
#if defined(__GNUC__) && (__GNUC__ >= 8)
#define X86_SIMD_SORT_UNROLL_LOOP(num) X86_SIMD_SORT_PRAGMA(GCC unroll num)
#else
#define X86_SIMD_SORT_UNROLL_LOOP(num)
#endif

template <typename type>
struct zmm_vector;
Expand Down Expand Up @@ -382,7 +393,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
// We will now have atleast 16 registers worth of data to process:
// left and right vtype::numlanes values are partitioned at the end
zmm_t vec_left[num_unroll], vec_right[num_unroll];
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
vec_left[ii] = vtype::loadu(arr + left + vtype::numlanes * ii);
vec_right[ii] = vtype::loadu(
Expand All @@ -403,20 +414,20 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
*/
if ((r_store + vtype::numlanes) - right < left - l_store) {
right -= num_unroll * vtype::numlanes;
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + right + ii * vtype::numlanes);
}
}
else {
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
curr_vec[ii] = vtype::loadu(arr + left + ii * vtype::numlanes);
}
left += num_unroll * vtype::numlanes;
}
// partition the current vector and save it on both sides of the array
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand All @@ -432,7 +443,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
}

/* partition and save vec_left[8] and vec_right[8] */
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand All @@ -445,7 +456,7 @@ static inline int64_t partition_avx512_unrolled(type_t *arr,
l_store += (vtype::numlanes - amount_ge_pivot);
r_store -= amount_ge_pivot;
}
#pragma GCC unroll 8
#pragma X86_SIMD_SORT_UNROLL_LOOP(8)
for (int ii = 0; ii < num_unroll; ++ii) {
int32_t amount_ge_pivot
= partition_vec<vtype>(arr,
Expand Down

0 comments on commit 94280b1

Please sign in to comment.