diff --git a/crypto/fipsmodule/sha/asm/sha256-armv4.pl b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
index c449a7c6c1..720221f844 100644
--- a/crypto/fipsmodule/sha/asm/sha256-armv4.pl
+++ b/crypto/fipsmodule/sha/asm/sha256-armv4.pl
@@ -482,6 +482,14 @@ ()
 .arch	armv7-a
 .fpu	neon
 
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_neon+4)
+#else
+.word K256-(.LK256_add_neon+8)
+#endif
+
 .global	sha256_block_data_order_neon
 .type	sha256_block_data_order_neon,%function
 .align	5
@@ -491,7 +499,21 @@ ()
 	stmdb	sp!,{r4-r12,lr}
 
 	sub	$H,sp,#16*4+16
-	adr	$Ktbl,K256
+
+	@ K256 is just at the boundary of being easily referenced by an ADR from
+	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+	@ not fit. By moving code around, we could make it fit, but this is too
+	@ fragile. For simplicity, just load the offset from
+	@ .LK256_shortcut_neon.
+	@
+	@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+	@ support it. We might be able to emulate it with a macro, but Android's
+	@ did not work when I tried it.
+	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+	ldr	$Ktbl,.LK256_shortcut_neon
+.LK256_add_neon:
+	add	$Ktbl,pc,$Ktbl
+
 	bic	$H,$H,#15		@ align for 128-bit stores
 	mov	$t2,sp
 	mov	sp,$H		@ alloca
@@ -617,12 +639,26 @@ ()
 # define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
+.LK256_shortcut_armv8:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_armv8+4)
+#else
+.word K256-(.LK256_add_armv8+8)
+#endif
+
 .type	sha256_block_data_order_armv8,%function
 .align	5
 sha256_block_data_order_armv8:
 .LARMv8:
+	@ K256 is too far to reference from one ADR command in Thumb mode. In
+	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+	@ boundary. For simplicity, just load the offset from .LK256_shortcut_armv8.
+	ldr	$Ktbl,.LK256_shortcut_armv8
+.LK256_add_armv8:
+	add	$Ktbl,pc,$Ktbl
+
 	vld1.32	{$ABCD,$EFGH},[$ctx]
-	sub	$Ktbl,$Ktbl,#256+32
 	add	$len,$inp,$len,lsl#6	@ len to point at the end of inp
 	b	.Loop_v8
diff --git a/generated-src/ios-arm/crypto/fipsmodule/sha256-armv4.S b/generated-src/ios-arm/crypto/fipsmodule/sha256-armv4.S
index 76e2d8b981..4c72aaff8e 100644
--- a/generated-src/ios-arm/crypto/fipsmodule/sha256-armv4.S
+++ b/generated-src/ios-arm/crypto/fipsmodule/sha256-armv4.S
@@ -1895,6 +1895,14 @@ Lrounds_16_xx:
+LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(LK256_add_neon+4)
+#else
+.word K256-(LK256_add_neon+8)
+#endif
+
 .globl	_sha256_block_data_order_neon
 .private_extern	_sha256_block_data_order_neon
 #ifdef __thumb2__
 .thumb_func	_sha256_block_data_order_neon
 #endif
 .align	5
@@ -1907,7 +1915,21 @@ LNEON:
 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
 
 	sub	r11,sp,#16*4+16
-	adr	r14,K256
+
+	@ K256 is just at the boundary of being easily referenced by an ADR from
+	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+	@ not fit. By moving code around, we could make it fit, but this is too
+	@ fragile. For simplicity, just load the offset from
+	@ .LK256_shortcut_neon.
+	@
+	@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+	@ support it. We might be able to emulate it with a macro, but Android's
+	@ did not work when I tried it.
+	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+	ldr	r14,LK256_shortcut_neon
+LK256_add_neon:
+	add	r14,pc,r14
+
 	bic	r11,r11,#15		@ align for 128-bit stores
 	mov	r12,sp
 	mov	sp,r11		@ alloca
@@ -2689,14 +2711,28 @@ L_00_48:
 # define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
+LK256_shortcut_armv8:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(LK256_add_armv8+4)
+#else
+.word K256-(LK256_add_armv8+8)
+#endif
+
 #ifdef __thumb2__
 .thumb_func	sha256_block_data_order_armv8
 #endif
 .align	5
 sha256_block_data_order_armv8:
 LARMv8:
+	@ K256 is too far to reference from one ADR command in Thumb mode. In
+	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+	@ boundary. For simplicity, just load the offset from .LK256_shortcut_armv8.
+	ldr	r3,LK256_shortcut_armv8
+LK256_add_armv8:
+	add	r3,pc,r3
+
 	vld1.32	{q0,q1},[r0]
-	sub	r3,r3,#256+32
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
 	b	Loop_v8
diff --git a/generated-src/linux-arm/crypto/fipsmodule/sha256-armv4.S b/generated-src/linux-arm/crypto/fipsmodule/sha256-armv4.S
index 3c63a8ca6e..4fdcdc876c 100644
--- a/generated-src/linux-arm/crypto/fipsmodule/sha256-armv4.S
+++ b/generated-src/linux-arm/crypto/fipsmodule/sha256-armv4.S
@@ -1893,6 +1893,14 @@ sha256_block_data_order:
 .arch	armv7-a
 .fpu	neon
 
+.LK256_shortcut_neon:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_neon+4)
+#else
+.word K256-(.LK256_add_neon+8)
+#endif
+
 .globl	sha256_block_data_order_neon
 .hidden	sha256_block_data_order_neon
 .type	sha256_block_data_order_neon,%function
 .align	5
@@ -1903,7 +1911,21 @@ sha256_block_data_order_neon:
 	stmdb	sp!,{r4,r5,r6,r7,r8,r9,r10,r11,r12,lr}
 
 	sub	r11,sp,#16*4+16
-	adr	r14,K256
+
+	@ K256 is just at the boundary of being easily referenced by an ADR from
+	@ this function. In Arm mode, when building with __ARM_ARCH=6, it does
+	@ not fit. By moving code around, we could make it fit, but this is too
+	@ fragile. For simplicity, just load the offset from
+	@ .LK256_shortcut_neon.
+	@
+	@ TODO(davidben): adrl would avoid a load, but clang-assembler does not
+	@ support it. We might be able to emulate it with a macro, but Android's
+	@ did not work when I tried it.
+	@ https://android.googlesource.com/platform/ndk/+/refs/heads/master/docs/ClangMigration.md#arm
+	ldr	r14,.LK256_shortcut_neon
+.LK256_add_neon:
+	add	r14,pc,r14
+
 	bic	r11,r11,#15		@ align for 128-bit stores
 	mov	r12,sp
 	mov	sp,r11		@ alloca
@@ -2685,12 +2707,26 @@ sha256_block_data_order_neon:
 # define INST(a,b,c,d)	.byte	a,b,c,d
 # endif
 
+.LK256_shortcut_armv8:
+@ PC is 8 bytes ahead in Arm mode and 4 bytes ahead in Thumb mode.
+#if defined(__thumb2__)
+.word K256-(.LK256_add_armv8+4)
+#else
+.word K256-(.LK256_add_armv8+8)
+#endif
+
 .type	sha256_block_data_order_armv8,%function
 .align	5
 sha256_block_data_order_armv8:
 .LARMv8:
+	@ K256 is too far to reference from one ADR command in Thumb mode. In
+	@ Arm mode, we could make it fit by aligning the ADR offset to a 64-byte
+	@ boundary. For simplicity, just load the offset from .LK256_shortcut_armv8.
+	ldr	r3,.LK256_shortcut_armv8
+.LK256_add_armv8:
+	add	r3,pc,r3
+
 	vld1.32	{q0,q1},[r0]
-	sub	r3,r3,#256+32
 	add	r2,r1,r2,lsl#6	@ len to point at the end of inp
 	b	.Loop_v8
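
For readers unfamiliar with the pattern the patch keeps repeating, here is a minimal standalone sketch of the same PC-relative constant trick. It is not part of the patch: it assumes GNU-assembler syntax (the linux-arm flavour above), and the names example_data, .Lexample_shortcut, .Lexample_add, and load_example_data_addr are invented for illustration only.

@ Standalone sketch (illustrative, not BoringSSL code): materialize the address
@ of a far-away symbol without ADR/ADRL by storing a PC-relative delta in a
@ nearby literal and adding the runtime PC to it, exactly as the patch does.
	.syntax	unified
	.text

.Lexample_shortcut:
@ Delta from the ADD below to example_data. The PC read by that ADD is the
@ instruction address plus 8 in Arm mode and plus 4 in Thumb mode, so the
@ bias is subtracted here at assembly time.
#if defined(__thumb2__)
.word	example_data-(.Lexample_add+4)
#else
.word	example_data-(.Lexample_add+8)
#endif

.globl	load_example_data_addr
.type	load_example_data_addr,%function
load_example_data_addr:
	ldr	r0,.Lexample_shortcut	@ r0 = example_data - (.Lexample_add + bias)
.Lexample_add:
	add	r0,pc,r0		@ pc reads as .Lexample_add + bias, so r0 = &example_data
	bx	lr
.size	load_example_data_addr,.-load_example_data_addr

@ example_data may sit far beyond what a single ADR can encode; only the .word
@ literal above has to stay within the LDR's literal-load range.
.align	2
example_data:
.word	0x5be0cd19

Storing a delta rather than an absolute address is what keeps the code position-independent: the .word needs no dynamic relocation, which is why the patch computes the address of K256 at run time instead of embedding a pointer to it.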