Skip to content

Commit

Permalink
Merge pull request #714 from argilo/remove-sse-32
Browse files Browse the repository at this point in the history
Remove broken sse_32 protokernels
  • Loading branch information
jdemel authored Dec 10, 2023
2 parents dde2d5a + 874d95c commit 523706f
Show file tree
Hide file tree
Showing 2 changed files with 0 additions and 257 deletions.
131 changes: 0 additions & 131 deletions kernels/volk/volk_32fc_x2_conjugate_dot_prod_32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -567,136 +567,5 @@ static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse(lv_32fc_t* result,
}
#endif

#if LV_HAVE_SSE && LV_HAVE_32
/*
 * Conjugate dot product for 32-bit x86 with SSE, written in inline assembly:
 *
 *     *result = sum over i of input[i] * conj(taps[i])
 *
 * result      where the single complex sum is stored; the asm also uses this
 *             location as scratch storage while it runs.
 * input/taps  complex vectors; both are read with movaps, so they must be
 *             16-byte aligned.
 * num_points  number of complex samples in each vector.
 *
 * The assembly handles samples in pairs (one 16-byte SSE register holds two
 * complex values), four samples per main-loop iteration plus an optional
 * two-sample tail.  A final odd sample is handled in C after the asm.
 *
 * NOTE(review): the asm unconditionally preloads 16 bytes from input/taps
 * before testing the loop counter, and preloads the next 16 bytes at the end
 * of every iteration.  It therefore reads up to 16 bytes beyond the last
 * processed sample pair (including when num_points is 0) -- confirm callers
 * tolerate this over-read.
 */
static inline void volk_32fc_x2_conjugate_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
                                                                 const lv_32fc_t* input,
                                                                 const lv_32fc_t* taps,
                                                                 unsigned int num_points)
{
    /* 8 bytes per sample -- assumes lv_32fc_t is a pair of 32-bit floats. */
    const unsigned int num_bytes = num_points * 8;

    /* Sign-bit mask for the imaginary lanes: XOR with it conjugates two samples. */
    __VOLK_ATTR_ALIGNED(16)
    static const uint32_t conjugator[4] = {
        0x00000000, 0x80000000, 0x00000000, 0x80000000
    };

    int bound = num_bytes >> 4;     /* number of complete sample pairs        */
    int leftovers = num_bytes % 16; /* 8 when num_points is odd, otherwise 0  */

    /*
     * The asm advances both data pointers and destroys the byte counter, so it
     * must work on private copies that are declared as read-write operands.
     * Passing the function arguments as input-only operands (as was done
     * before) lets the compiler assume they are unchanged afterwards, which
     * breaks the tail code below that still needs the original pointers.
     */
    const lv_32fc_t* input_cursor = input;
    const lv_32fc_t* taps_cursor = taps;
    unsigned int byte_counter = num_bytes;

    __VOLK_ASM __VOLK_VOLATILE(
        " #pushl %%ebp\n\t"
        " #movl %%esp, %%ebp\n\t"
        " #movl 12(%%ebp), %%eax # input\n\t"
        " #movl 16(%%ebp), %%edx # taps\n\t"
        " #movl 20(%%ebp), %%ecx # n_bytes\n\t"
        " movaps 0(%[conjugator]), %%xmm1\n\t"
        " xorps %%xmm6, %%xmm6 # zero accumulators\n\t"
        " movaps 0(%[eax]), %%xmm0\n\t"
        " xorps %%xmm7, %%xmm7 # zero accumulators\n\t"
        " movaps 0(%[edx]), %%xmm2\n\t"
        " movl %[ecx], (%[out])\n\t"
        " shrl $5, %[ecx] # ecx = n_2_ccomplex_blocks / 2\n\t"

        " xorps %%xmm1, %%xmm2\n\t"
        " jmp .%=L1_test\n\t"
        " # 4 taps / loop\n\t"
        " # something like ?? cycles / loop\n\t"
        ".%=Loop1: \n\t"
        "# complex prod: C += A * B, w/ temp Z & Y (or B), xmmPN=$0x8000000080000000\n\t"
        "# movaps (%[eax]), %%xmmA\n\t"
        "# movaps (%[edx]), %%xmmB\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# shufps $0xb1, %%xmmZ, %%xmmZ # swap internals\n\t"
        "# mulps %%xmmB, %%xmmA\n\t"
        "# mulps %%xmmZ, %%xmmB\n\t"
        "# # SSE replacement for: pfpnacc %%xmmB, %%xmmA\n\t"
        "# xorps %%xmmPN, %%xmmA\n\t"
        "# movaps %%xmmA, %%xmmZ\n\t"
        "# unpcklps %%xmmB, %%xmmA\n\t"
        "# unpckhps %%xmmB, %%xmmZ\n\t"
        "# movaps %%xmmZ, %%xmmY\n\t"
        "# shufps $0x44, %%xmmA, %%xmmZ # b01000100\n\t"
        "# shufps $0xee, %%xmmY, %%xmmA # b11101110\n\t"
        "# addps %%xmmZ, %%xmmA\n\t"
        "# addps %%xmmA, %%xmmC\n\t"
        "# A=xmm0, B=xmm2, Z=xmm4\n\t"
        "# A'=xmm1, B'=xmm3, Z'=xmm5\n\t"
        " movaps 16(%[edx]), %%xmm3\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " xorps %%xmm1, %%xmm3\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " movaps 16(%[eax]), %%xmm1\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " movaps %%xmm1, %%xmm5\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm3, %%xmm1\n\t"
        " shufps $0xb1, %%xmm5, %%xmm5 # swap internals\n\t"
        " addps %%xmm1, %%xmm6\n\t"
        " movaps 0(%[conjugator]), %%xmm1\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " movaps 32(%[eax]), %%xmm0\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        " mulps %%xmm5, %%xmm3\n\t"
        " addl $32, %[eax]\n\t"
        " movaps 32(%[edx]), %%xmm2\n\t"
        " addps %%xmm3, %%xmm7\n\t"
        " xorps %%xmm1, %%xmm2\n\t"
        " addl $32, %[edx]\n\t"
        ".%=L1_test:\n\t"
        " decl %[ecx]\n\t"
        " jge .%=Loop1\n\t"
        " # We've handled the bulk of multiplies up to here.\n\t"
        " # Let's sse if original n_2_ccomplex_blocks was odd.\n\t"
        " # If so, we've got 2 more taps to do.\n\t"
        " movl 0(%[out]), %[ecx] # n_2_ccomplex_blocks\n\t"
        " shrl $4, %[ecx]\n\t"
        " andl $1, %[ecx]\n\t"
        " je .%=Leven\n\t"
        " # The count was odd, do 2 more taps.\n\t"
        " # Note that we've already got mm0/mm2 preloaded\n\t"
        " # from the main loop.\n\t"
        " movaps %%xmm0, %%xmm4\n\t"
        " mulps %%xmm2, %%xmm0\n\t"
        " shufps $0xb1, %%xmm4, %%xmm4 # swap internals\n\t"
        " addps %%xmm0, %%xmm6\n\t"
        " mulps %%xmm4, %%xmm2\n\t"
        " addps %%xmm2, %%xmm7\n\t"
        ".%=Leven:\n\t"
        " # neg inversor\n\t"
        " #movl 8(%%ebp), %[eax] \n\t"
        " xorps %%xmm1, %%xmm1\n\t"
        " movl $0x80000000, (%[out])\n\t"
        " movss (%[out]), %%xmm1\n\t"
        " shufps $0x11, %%xmm1, %%xmm1 # b00010001 # 0 -0 0 -0\n\t"
        " # pfpnacc\n\t"
        " xorps %%xmm1, %%xmm6\n\t"
        " movaps %%xmm6, %%xmm2\n\t"
        " unpcklps %%xmm7, %%xmm6\n\t"
        " unpckhps %%xmm7, %%xmm2\n\t"
        " movaps %%xmm2, %%xmm3\n\t"
        " shufps $0x44, %%xmm6, %%xmm2 # b01000100\n\t"
        " shufps $0xee, %%xmm3, %%xmm6 # b11101110\n\t"
        " addps %%xmm2, %%xmm6\n\t"
        " # xmm6 = r1 i2 r3 i4\n\t"
        " #movl 8(%%ebp), %[eax] # @result\n\t"
        " movhlps %%xmm6, %%xmm4 # xmm4 = r3 i4 ?? ??\n\t"
        " addps %%xmm4, %%xmm6 # xmm6 = r1+r3 i2+i4 ?? ??\n\t"
        " movlps %%xmm6, (%[out]) # store low 2x32 bits (complex) "
        "to memory\n\t"
        " #popl %%ebp\n\t"
        /* Read-write, early-clobber: modified while the inputs are still in use. */
        : [eax] "+&r"(input_cursor),
          [edx] "+&r"(taps_cursor),
          [ecx] "+&r"(byte_counter)
        : [out] "r"(result),
          [conjugator] "r"(conjugator)
        /* *result is written behind the compiler's back; flags and xmm0-7 are used. */
        : "memory",
          "cc",
          "xmm0",
          "xmm1",
          "xmm2",
          "xmm3",
          "xmm4",
          "xmm5",
          "xmm6",
          "xmm7");

    /*
     * leftovers is either 0 or 8, so at most one sample remains: the last one
     * of an odd-length vector, located right after the `bound` complete pairs.
     */
    if (leftovers > 0) {
        *result += (input[(bound << 1)] * lv_conj(taps[(bound << 1)]));
    }
}
#endif /* LV_HAVE_SSE && LV_HAVE_32 */


#endif /*INCLUDED_volk_32fc_x2_conjugate_dot_prod_32fc_a_H*/
126 changes: 0 additions & 126 deletions kernels/volk/volk_32fc_x2_dot_prod_32fc.h
Original file line number Diff line number Diff line change
Expand Up @@ -651,132 +651,6 @@ static inline void volk_32fc_x2_dot_prod_32fc_a_sse_64(lv_32fc_t* result,

#endif

#if LV_HAVE_SSE && LV_HAVE_32

/*
 * Dot product entry point for 32-bit x86 with SSE.
 *
 * This variant has no SSE code of its own: it forwards all of its arguments
 * unchanged to volk_32fc_x2_dot_prod_32fc_generic() and exists only so that
 * the `_a_sse_32` name is defined.
 *
 * A hand-written assembly body used to live here under `#if 0`.  It was never
 * compiled, fetched its arguments through fixed %ebp stack offsets while its
 * own frame setup was commented out, and left an unused local behind, so it
 * has been dropped rather than kept as dead code.
 *
 * result      destination passed through to the generic kernel.
 * input/taps  complex input vectors passed through to the generic kernel.
 * num_points  element count passed through to the generic kernel.
 */
static inline void volk_32fc_x2_dot_prod_32fc_a_sse_32(lv_32fc_t* result,
                                                       const lv_32fc_t* input,
                                                       const lv_32fc_t* taps,
                                                       unsigned int num_points)
{
    volk_32fc_x2_dot_prod_32fc_generic(result, input, taps, num_points);
}

#endif /* LV_HAVE_SSE && LV_HAVE_32 */

#ifdef LV_HAVE_SSE3

#include <pmmintrin.h>
Expand Down

0 comments on commit 523706f

Please sign in to comment.