Update implementations from upstream glibc #9

Closed
wants to merge 1 commit
1 change: 1 addition & 0 deletions CMakeLists.txt
@@ -24,6 +24,7 @@ add_library(memcpy STATIC
impls/memmove-avx-unaligned-erms.s
impls/memmove-avx512-no-vzeroupper.s
impls/memmove-avx512-unaligned-erms.s
impls/memmove-erms.s
impls/memmove-evex-unaligned-erms.s
impls/memmove-sse2-unaligned-erms.s
impls/memmove-ssse3-back.s
40 changes: 21 additions & 19 deletions impls/memmove-avx-unaligned-erms-rtm.s
@@ -22,7 +22,7 @@ __memmove_avx_unaligned_rtm:
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi,%rdx)

jmp .Lreturn
jmp .Lreturn_vzeroupper

.globl __mempcpy_avx_unaligned_erms_rtm
__mempcpy_avx_unaligned_erms_rtm:
@@ -45,9 +45,9 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu -32(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi, %rdx)
.Lreturn:
.Lreturn_vzeroupper:

xtest; jz 1f; vzeroall; ret; 1: vzeroupper; ret
xtest; jnz 1f; vzeroupper; ret; 1: vzeroall; ret
.p2align 4
.Lless_vec:
cmpl $16, %edx
@@ -105,7 +105,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, 32(%rdi)
vmovdqu %ymm2, -32(%rdi, %rdx)
vmovdqu %ymm3, -(32 * 2)(%rdi, %rdx)
jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 4

@@ -136,7 +136,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm5, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm6, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm7, -(32 * 4)(%rdi, %rdx)
jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 4,, 4
.Lmore_8x_vec:
@@ -205,7 +205,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm0, (%rcx)

.Lnop_backward:
jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 4,, 8
.Lmore_8x_vec_backward_check_nop:
@@ -249,7 +249,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm7, (32 * 3)(%rdi)

vmovdqu %ymm8, -32(%rdx, %rdi)
jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 5,, 16

@@ -277,7 +277,7 @@ __memmove_avx_unaligned_erms_rtm:

vmovdqu %ymm1, 32(%r8)

jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 4,, 12
.Lmovsb:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

testl $(1 << 0), __x86_string_control(%rip)
testb $(1 << 0), __x86_string_control(%rip)

jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward

vmovdqu %ymm1, 32(%r8)

jmp .Lreturn
jmp .Lreturn_vzeroupper
.p2align 4,, 10

.Llarge_memcpy_2x_check:
cmp __x86_rep_movsb_threshold(%rip), %rdx
jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
mov __x86_shared_non_temporal_threshold(%rip), %r11
cmp %r11, %rdx
jb .Lmore_8x_vec_check

negq %rcx
cmpq %rcx, %rdx
addq %r8, %rdx

notl %ecx
movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

movq %rdx, %r10
shrq $4, %r10
cmp __x86_shared_non_temporal_threshold(%rip), %r10
shlq $4, %r11
cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

shrq $((12 + 1) - 4), %r10
shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -418,11 +421,10 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm3, -32(%rdi, %rdx)
jmp .Lreturn
jmp .Lreturn_vzeroupper

.p2align 4
.Llarge_memcpy_4x:
movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

@@ -490,7 +492,7 @@ __memmove_avx_unaligned_erms_rtm:
vmovdqu %ymm1, -(32 * 3)(%rdi, %rdx)
vmovdqu %ymm2, -(32 * 2)(%rdi, %rdx)
vmovdqu %ymm3, -32(%rdi, %rdx)
jmp .Lreturn
jmp .Lreturn_vzeroupper

.globl __memcpy_avx_unaligned_erms_rtm
.set __memcpy_avx_unaligned_erms_rtm, __memmove_avx_unaligned_erms_rtm
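
Note: the RTM variant above renames .Lreturn to .Lreturn_vzeroupper and reworks the return sequence around xtest. A C sketch of what that sequence does (illustrative only, not part of the patch; the usual glibc rationale is that vzeroupper can abort an active RTM transaction, while vzeroall is transaction-safe):

#include <immintrin.h>

/* Illustrative rendering of .Lreturn_vzeroupper; build with e.g. gcc -mavx -mrtm. */
static inline void rtm_safe_avx_return(void)
{
    if (_xtest()) {
        /* Inside an RTM transaction: clear all vector state instead of
           using vzeroupper, which could abort the transaction. */
        _mm256_zeroall();
    } else {
        /* Normal path outside a transaction: drop the YMM upper halves. */
        _mm256_zeroupper();
    }
}
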
20 changes: 11 additions & 9 deletions impls/memmove-avx-unaligned-erms.s
@@ -45,7 +45,7 @@ __memmove_avx_unaligned_erms:
vmovdqu -32(%rsi, %rdx), %ymm1
vmovdqu %ymm0, (%rdi)
vmovdqu %ymm1, -32(%rdi, %rdx)
.Lreturn:
.Lreturn_vzeroupper:

vzeroupper; ret
.p2align 4
@@ -293,7 +293,8 @@ __memmove_avx_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

testl $(1 << 0), __x86_string_control(%rip)
testb $(1 << 0), __x86_string_control(%rip)

jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
@@ -323,9 +324,11 @@ __memmove_avx_unaligned_erms:
.p2align 4,, 10

.Llarge_memcpy_2x_check:
cmp __x86_rep_movsb_threshold(%rip), %rdx
jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
mov __x86_shared_non_temporal_threshold(%rip), %r11
cmp %r11, %rdx
jb .Lmore_8x_vec_check

negq %rcx
cmpq %rcx, %rdx
addq %r8, %rdx

notl %ecx
movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

movq %rdx, %r10
shrq $4, %r10
cmp __x86_shared_non_temporal_threshold(%rip), %r10
shlq $4, %r11
cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

shrq $((12 + 1) - 4), %r10
shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -422,7 +425,6 @@ __memmove_avx_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

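
Note: the restructured large-copy check in these files caches __x86_shared_non_temporal_threshold in %r11 and shifts the threshold left instead of shifting the length right. A sketch (illustrative only; `len` stands for %rdx, `nt_threshold` for the tunable) of why the two tests select the same lengths:

#include <stdint.h>

/* The two forms of the "use the 4x non-temporal path?" test seen in the hunk above. */
static int use_4x_path_shift_len(uint64_t len, uint64_t nt_threshold)
{
    return (len >> 4) >= nt_threshold;       /* shrq $4, %r10; cmp threshold, %r10 */
}

static int use_4x_path_shift_threshold(uint64_t len, uint64_t nt_threshold)
{
    return len >= (nt_threshold << 4);       /* shlq $4, %r11; cmp %r11, %rdx */
}

/* Barring overflow of the left shift, both pick the same lengths, so %r10 can
   keep the raw length and the later chunk-count shift becomes (12 + 1)
   instead of (12 + 1) - 4. */
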
20 changes: 11 additions & 9 deletions impls/memmove-avx512-unaligned-erms.s
@@ -45,7 +45,7 @@ __memmove_avx512_unaligned_erms:
vmovdqu64 -64(%rsi, %rdx), %zmm17
vmovdqu64 %zmm16, (%rdi)
vmovdqu64 %zmm17, -64(%rdi, %rdx)
.Lreturn:
.Lreturn_vzeroupper:

; ret
.p2align 4,, 8
@@ -301,7 +301,8 @@ __memmove_avx512_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

testl $(1 << 0), __x86_string_control(%rip)
testb $(1 << 0), __x86_string_control(%rip)

jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
.p2align 4,, 10

.Llarge_memcpy_2x_check:
cmp __x86_rep_movsb_threshold(%rip), %rdx
jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
mov __x86_shared_non_temporal_threshold(%rip), %r11
cmp %r11, %rdx
jb .Lmore_8x_vec_check

negq %rcx
cmpq %rcx, %rdx
addq %r8, %rdx

notl %ecx
movq %rdx, %r10
testl $(4096 - 64 * 8), %ecx
jz .Llarge_memcpy_4x

movq %rdx, %r10
shrq $4, %r10
cmp __x86_shared_non_temporal_threshold(%rip), %r10
shlq $4, %r11
cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

shrq $((12 + 1) - 4), %r10
shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -419,7 +422,6 @@ __memmove_avx512_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
movq %rdx, %r10

andl $(4096 * 4 - 1), %edx

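
Note: each unaligned-erms variant also changes the __x86_string_control test from testl to testb; the bit tested is the same, and a byte-wide access touches only the tunable byte itself, which is presumably the point. A sketch of the condition (the stand-in variable is ours, not glibc's):

#include <stdint.h>

static uint8_t x86_string_control;   /* models __x86_string_control */

/* testb $(1 << 0), __x86_string_control(%rip): when bit 0 is set, the code
   also checks the src/dst distance before using rep movsb; when it is
   clear, it jumps straight to .Lskip_short_movsb_check. */
static int avoid_short_distance_rep_movsb(void)
{
    return x86_string_control & (1 << 0);
}
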
38 changes: 38 additions & 0 deletions impls/memmove-erms.s
@@ -0,0 +1,38 @@
.text
.globl __mempcpy_erms
__mempcpy_erms:
mov %rdi, %rax

test %rdx, %rdx
jz 2f
add %rdx, %rax
jmp .Lstart_movsb

.globl __memmove_erms
__memmove_erms:
movq %rdi, %rax

test %rdx, %rdx
jz 2f
.Lstart_movsb:
mov %rdx, %rcx
cmp %rsi, %rdi
jb 1f

je 2f
lea (%rsi,%rcx), %rdx
cmp %rdx, %rdi
jb .Lmovsb_backward
1:
rep movsb
2:
ret
.Lmovsb_backward:
leaq -1(%rdi,%rcx), %rdi
leaq -1(%rsi,%rcx), %rsi
std
rep movsb
cld
ret
.globl __memcpy_erms
.set __memcpy_erms, __memmove_erms
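
Note: for orientation, a rough C model of the new __memmove_erms above (illustrative only; the function name and byte loops are ours, and the assembly uses rep movsb, with std/cld around it for the backward case):

#include <stddef.h>

static void *memmove_erms_model(void *dst, const void *src, size_t len)
{
    unsigned char *d = dst;
    const unsigned char *s = src;

    if (len == 0 || d == s)
        return dst;                          /* nothing to move */

    if (d < s || d >= s + len) {
        /* Destination does not start inside the source: forward copy. */
        for (size_t i = 0; i < len; i++)
            d[i] = s[i];
    } else {
        /* Destination starts inside [src, src+len): copy backward so the
           bytes not yet copied are never overwritten. */
        for (size_t i = len; i-- > 0; )
            d[i] = s[i];
    }
    return dst;
}

/* __mempcpy_erms is the same routine except it returns dst + len. */
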
20 changes: 11 additions & 9 deletions impls/memmove-evex-unaligned-erms.s
@@ -45,7 +45,7 @@ __memmove_evex_unaligned_erms:
vmovdqu64 -32(%rsi, %rdx), %ymm17
vmovdqu64 %ymm16, (%rdi)
vmovdqu64 %ymm17, -32(%rdi, %rdx)
.Lreturn:
.Lreturn_vzeroupper:

; ret
.p2align 4,, 8
@@ -292,7 +292,8 @@ __memmove_evex_unaligned_erms:
cmp __x86_rep_movsb_stop_threshold(%rip), %rdx
jae .Llarge_memcpy_2x_check

testl $(1 << 0), __x86_string_control(%rip)
testb $(1 << 0), __x86_string_control(%rip)

jz .Lskip_short_movsb_check
cmpl $-64, %ecx
ja .Lmore_8x_vec_forward
@@ -322,9 +323,11 @@ __memmove_evex_unaligned_erms:
.p2align 4,, 10

.Llarge_memcpy_2x_check:
cmp __x86_rep_movsb_threshold(%rip), %rdx
jb .Lmore_8x_vec_check

.Llarge_memcpy_2x:
mov __x86_shared_non_temporal_threshold(%rip), %r11
cmp %r11, %rdx
jb .Lmore_8x_vec_check

negq %rcx
cmpq %rcx, %rdx
addq %r8, %rdx

notl %ecx
movq %rdx, %r10
testl $(4096 - 32 * 8), %ecx
jz .Llarge_memcpy_4x

movq %rdx, %r10
shrq $4, %r10
cmp __x86_shared_non_temporal_threshold(%rip), %r10
shlq $4, %r11
cmp %r11, %rdx
jae .Llarge_memcpy_4x

andl $(4096 * 2 - 1), %edx

shrq $((12 + 1) - 4), %r10
shrq $(12 + 1), %r10

.p2align 4
.Lloop_large_memcpy_2x_outer:
@@ -421,7 +424,6 @@ __memmove_evex_unaligned_erms:

.p2align 4
.Llarge_memcpy_4x:
movq %rdx, %r10

andl $(4096 * 4 - 1), %edx
