From 25ebaa9e4dc9d53a3bb833902fb30a2c302307e2 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 14 Nov 2023 13:47:29 -0800 Subject: [PATCH 1/9] Improve integer operation support in BOUNDER_RULE and BOUNDER_TAC In general, BOUNDER_RULE now directly handles operations over Z and N, assuming an outer real_of_int / real_of_num cast into R (this is also automated in the tactic form BOUNDER_TAC). In particular, this change can greatly improve bounds for terms involving integer or natural number division and remainder (DIV, div, MOD and rem) as well as cutoff subtraction over N. There is also now support for conditionals, though the condition is not used as extra context, simply being the basis for a case split. This update rolls in various trivial typographic fixes in comments. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/ccefa2a7109a9a784e5fea00da1817832dd28f73 --- arm/curve25519/edwards25519_scalarmulbase.S | 2 +- arm/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- arm/p384/Makefile | 2 +- arm/p521/Makefile | 2 +- x86_att/curve25519/edwards25519_scalarmulbase.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- 10 files changed, 10 insertions(+), 10 deletions(-) diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 8c9d0f9193..89e98494ac 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -577,7 +577,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase): // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index 03e5598f2c..e89d58b378 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -419,7 +419,7 @@ S2N_BN_SYMBOL(edwards25519_scalarmulbase_alt): // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index 00ea37eaaf..d6fc9121f9 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -1514,7 +1514,7 @@ edwards25519_scalarmuldouble_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. 
But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. cmp cf, xzr diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index ad05eae1fb..54cebef997 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1298,7 +1298,7 @@ edwards25519_scalarmuldouble_alt_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. cmp cf, xzr diff --git a/arm/p384/Makefile b/arm/p384/Makefile index d3feb070c7..2390e53e44 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC ############################################################################# -# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise # use a cross-assembling version so that the code can still be assembled # and the proofs checked against the object files (though you won't be able # to run code without additional emulation infrastructure). The aarch64 diff --git a/arm/p521/Makefile b/arm/p521/Makefile index b8ad763c35..9121b81013 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -3,7 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 OR ISC ############################################################################# -# If actually on an ARM8 machine, just use the GNU assmbler (as). Otherwise +# If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise # use a cross-assembling version so that the code can still be assembled # and the proofs checked against the object files (though you won't be able # to run code without additional emulation infrastructure). The aarch64 diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index c44e31724c..950b8dc649 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -431,7 +431,7 @@ edwards25519_scalarmulbase_standard: // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). // In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index 00b91fe1aa..db7fa574b5 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -507,7 +507,7 @@ edwards25519_scalarmulbase_alt_standard: // (X,Y,Z,T), representing an affine point on the edwards25519 curve // (x,y) via x = X/Z, y = Y/Z and x * y = T/Z (so X * Y = T * Z). 
// In comments B means the standard basepoint (x,4/5) = -// (0x216....f25d51a,0x0x6666..666658). +// (0x216....f25d51a,0x6666..666658). // // Initialize accumulator "acc" to either 0 or 2^251 * B depending on // bit 251 of the (reduced) scalar. That leaves bits 0..250 to handle. diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 35fd7f4ffc..91a27e8cf3 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -1528,7 +1528,7 @@ edwards25519_scalarmuldouble_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. movq cf, %rdi diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e17d10b47a..42380c036c 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1645,7 +1645,7 @@ edwards25519_scalarmuldouble_alt_loop: // form amounts to swapping the first two fields and negating the third. // The negation does not always fully reduce even mod 2^256-38 in the zero // case, instead giving -0 = 2^256-38. But that is fine since the result is -// always fed to a multipliction inside the "pepadd" function below that +// always fed to a multiplication inside the "pepadd" function below that // handles any 256-bit input. movq cf, %rdi From 34438c96972e4f2a6c46a12cb468c2d3ad7d9b50 Mon Sep 17 00:00:00 2001 From: Torben Hansen <50673096+torben-hansen@users.noreply.github.com> Date: Wed, 15 Nov 2023 15:18:45 -0800 Subject: [PATCH 2/9] Avoid duplicate labels in ed25519 x86 implementation s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/e6ef86f7ebee4db5dccb351ff7ef7729de6dea42 --- x86_att/curve25519/edwards25519_scalarmuldouble.S | 8 ++++---- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index 35fd7f4ffc..7e5fd2b41c 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -2072,8 +2072,8 @@ edwards25519_scalarmuldouble_loop: movq %rax, 0x78(%rsp) movq $0xa, 0x90(%rsp) movq $0x1, 0x98(%rsp) - jmp curve25519_x25519_midloop -curve25519_x25519_inverseloop: + jmp edwards25519_scalarmuldouble_midloop +edwards25519_scalarmuldouble_inverseloop: movq %r8, %r9 sarq $0x3f, %r9 xorq %r9, %r8 @@ -2364,7 +2364,7 @@ curve25519_x25519_inverseloop: shlq $0x3f, %rax addq %rax, %rsi movq %rsi, 0x78(%rsp) -curve25519_x25519_midloop: +edwards25519_scalarmuldouble_midloop: movq 0x98(%rsp), %rsi movq (%rsp), %rdx movq 0x20(%rsp), %rcx @@ -3265,7 +3265,7 @@ curve25519_x25519_midloop: leaq (%rax,%rdx), %r12 movq %rsi, 0x98(%rsp) decq 0x90(%rsp) - jne curve25519_x25519_inverseloop + jne edwards25519_scalarmuldouble_inverseloop movq (%rsp), %rax movq 0x20(%rsp), %rcx imulq %r8, %rax diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index e17d10b47a..4cd5d1e63f 100644 --- 
a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -2189,8 +2189,8 @@ edwards25519_scalarmuldouble_alt_loop: movq %rax, 0x78(%rsp) movq $0xa, 0x90(%rsp) movq $0x1, 0x98(%rsp) - jmp curve25519_x25519_midloop -curve25519_x25519_inverseloop: + jmp edwards25519_scalarmuldouble_alt_midloop +edwards25519_scalarmuldouble_alt_inverseloop: movq %r8, %r9 sarq $0x3f, %r9 xorq %r9, %r8 @@ -2481,7 +2481,7 @@ curve25519_x25519_inverseloop: shlq $0x3f, %rax addq %rax, %rsi movq %rsi, 0x78(%rsp) -curve25519_x25519_midloop: +edwards25519_scalarmuldouble_alt_midloop: movq 0x98(%rsp), %rsi movq (%rsp), %rdx movq 0x20(%rsp), %rcx @@ -3382,7 +3382,7 @@ curve25519_x25519_midloop: leaq (%rax,%rdx), %r12 movq %rsi, 0x98(%rsp) decq 0x90(%rsp) - jne curve25519_x25519_inverseloop + jne edwards25519_scalarmuldouble_alt_inverseloop movq (%rsp), %rax movq 0x20(%rsp), %rcx imulq %r8, %rax From 9405fc5fc2122a4b862f4e1933161d37b51143ec Mon Sep 17 00:00:00 2001 From: Torben Hansen <50673096+torben-hansen@users.noreply.github.com> Date: Mon, 27 Nov 2023 10:50:22 -0800 Subject: [PATCH 3/9] Make parameter to ed25519 decode function const s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/d93670d3d7ddb23cd4059ac519038e7796cd3d45 --- arm/curve25519/edwards25519_decode.S | 2 +- arm/curve25519/edwards25519_decode_alt.S | 2 +- x86_att/curve25519/edwards25519_decode.S | 2 +- x86_att/curve25519/edwards25519_decode_alt.S | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S index 9161768db7..653689be94 100644 --- a/arm/curve25519/edwards25519_decode.S +++ b/arm/curve25519/edwards25519_decode.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S index c77a191744..a8e842f15a 100644 --- a/arm/curve25519/edwards25519_decode_alt.S +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S index 05681925a3..24431ef564 100644 --- a/x86_att/curve25519/edwards25519_decode.S +++ b/x86_att/curve25519/edwards25519_decode.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as 
diff --git a/x86_att/curve25519/edwards25519_decode_alt.S b/x86_att/curve25519/edwards25519_decode_alt.S index 570b2f9081..c7854380e1 100644 --- a/x86_att/curve25519/edwards25519_decode_alt.S +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -5,7 +5,7 @@ // Decode compressed 256-bit form of edwards25519 point // Input c[32] (bytes); output function return and z[8] // -// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8],uint8_t c[static 32]); +// extern uint64_t edwards25519_decode_alt(uint64_t z[static 8], const uint8_t c[static 32]); // // This interprets the input byte string as a little-endian number // representing a point (x,y) on the edwards25519 curve, encoded as From 580c3171a9148f8e489c73a4c9f570ca64f48be0 Mon Sep 17 00:00:00 2001 From: jargh <78765052+jargh@users.noreply.github.com> Date: Tue, 23 Jan 2024 12:20:41 -0800 Subject: [PATCH 4/9] Allow MIT-0 license as well as Apache-2.0 and ISC (#104) * Allow MIT-0 license as well as Apache-2.0 and ISC * Add appropriate year range to MIT-0 license s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/48fb153e097894a90f84defe913fd1a572cb7900 --- arm/curve25519/bignum_mod_n25519.S | 2 +- arm/curve25519/bignum_neg_p25519.S | 2 +- arm/curve25519/curve25519_x25519.S | 2 +- arm/curve25519/curve25519_x25519_alt.S | 2 +- arm/curve25519/curve25519_x25519_byte.S | 2 +- arm/curve25519/curve25519_x25519_byte_alt.S | 2 +- arm/curve25519/curve25519_x25519base.S | 2 +- arm/curve25519/curve25519_x25519base_alt.S | 2 +- arm/curve25519/curve25519_x25519base_byte.S | 2 +- arm/curve25519/curve25519_x25519base_byte_alt.S | 2 +- arm/curve25519/edwards25519_decode.S | 2 +- arm/curve25519/edwards25519_decode_alt.S | 2 +- arm/curve25519/edwards25519_encode.S | 2 +- arm/curve25519/edwards25519_scalarmulbase.S | 2 +- arm/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble.S | 2 +- arm/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- arm/fastmul/bignum_emontredc_8n.S | 2 +- arm/fastmul/bignum_kmul_16_32.S | 2 +- arm/fastmul/bignum_kmul_32_64.S | 2 +- arm/fastmul/bignum_ksqr_16_32.S | 2 +- arm/fastmul/bignum_ksqr_32_64.S | 2 +- arm/generic/bignum_ge.S | 2 +- arm/generic/bignum_mul.S | 2 +- arm/generic/bignum_optsub.S | 2 +- arm/generic/bignum_sqr.S | 2 +- arm/p384/Makefile | 2 +- arm/p384/bignum_add_p384.S | 2 +- arm/p384/bignum_bigendian_6.S | 2 +- arm/p384/bignum_cmul_p384.S | 2 +- arm/p384/bignum_deamont_p384.S | 2 +- arm/p384/bignum_demont_p384.S | 2 +- arm/p384/bignum_double_p384.S | 2 +- arm/p384/bignum_half_p384.S | 2 +- arm/p384/bignum_littleendian_6.S | 2 +- arm/p384/bignum_mod_n384.S | 2 +- arm/p384/bignum_mod_n384_6.S | 2 +- arm/p384/bignum_mod_p384.S | 2 +- arm/p384/bignum_mod_p384_6.S | 2 +- arm/p384/bignum_montmul_p384.S | 2 +- arm/p384/bignum_montmul_p384_alt.S | 2 +- arm/p384/bignum_montsqr_p384.S | 2 +- arm/p384/bignum_montsqr_p384_alt.S | 2 +- arm/p384/bignum_mux_6.S | 2 +- arm/p384/bignum_neg_p384.S | 2 +- arm/p384/bignum_nonzero_6.S | 2 +- arm/p384/bignum_optneg_p384.S | 2 +- arm/p384/bignum_sub_p384.S | 2 +- arm/p384/bignum_tomont_p384.S | 2 +- arm/p384/bignum_triple_p384.S | 2 +- arm/p384/p384_montjadd.S | 2 +- arm/p384/p384_montjdouble.S | 2 +- arm/p384/p384_montjmixadd.S | 2 +- arm/p521/Makefile | 2 +- arm/p521/bignum_add_p521.S | 2 +- arm/p521/bignum_cmul_p521.S | 2 +- arm/p521/bignum_deamont_p521.S | 2 +- arm/p521/bignum_demont_p521.S | 2 +- arm/p521/bignum_double_p521.S | 2 +- arm/p521/bignum_fromlebytes_p521.S | 2 +- arm/p521/bignum_half_p521.S | 2 +- 
arm/p521/bignum_mod_n521_9.S | 2 +- arm/p521/bignum_mod_p521_9.S | 2 +- arm/p521/bignum_montmul_p521.S | 2 +- arm/p521/bignum_montmul_p521_alt.S | 2 +- arm/p521/bignum_montsqr_p521.S | 2 +- arm/p521/bignum_montsqr_p521_alt.S | 2 +- arm/p521/bignum_mul_p521.S | 2 +- arm/p521/bignum_mul_p521_alt.S | 2 +- arm/p521/bignum_neg_p521.S | 2 +- arm/p521/bignum_optneg_p521.S | 2 +- arm/p521/bignum_sqr_p521.S | 2 +- arm/p521/bignum_sqr_p521_alt.S | 2 +- arm/p521/bignum_sub_p521.S | 2 +- arm/p521/bignum_tolebytes_p521.S | 2 +- arm/p521/bignum_tomont_p521.S | 2 +- arm/p521/bignum_triple_p521.S | 2 +- arm/p521/p521_jadd.S | 2 +- arm/p521/p521_jdouble.S | 2 +- arm/p521/p521_jmixadd.S | 2 +- x86_att/curve25519/bignum_mod_n25519.S | 2 +- x86_att/curve25519/bignum_neg_p25519.S | 2 +- x86_att/curve25519/curve25519_x25519.S | 2 +- x86_att/curve25519/curve25519_x25519_alt.S | 2 +- x86_att/curve25519/curve25519_x25519base.S | 2 +- x86_att/curve25519/curve25519_x25519base_alt.S | 2 +- x86_att/curve25519/edwards25519_decode.S | 2 +- x86_att/curve25519/edwards25519_decode_alt.S | 2 +- x86_att/curve25519/edwards25519_encode.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase.S | 2 +- x86_att/curve25519/edwards25519_scalarmulbase_alt.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble.S | 2 +- x86_att/curve25519/edwards25519_scalarmuldouble_alt.S | 2 +- x86_att/p384/bignum_add_p384.S | 2 +- x86_att/p384/bignum_bigendian_6.S | 2 +- x86_att/p384/bignum_cmul_p384.S | 2 +- x86_att/p384/bignum_cmul_p384_alt.S | 2 +- x86_att/p384/bignum_deamont_p384.S | 2 +- x86_att/p384/bignum_deamont_p384_alt.S | 2 +- x86_att/p384/bignum_demont_p384.S | 2 +- x86_att/p384/bignum_demont_p384_alt.S | 2 +- x86_att/p384/bignum_double_p384.S | 2 +- x86_att/p384/bignum_half_p384.S | 2 +- x86_att/p384/bignum_littleendian_6.S | 2 +- x86_att/p384/bignum_mod_n384.S | 2 +- x86_att/p384/bignum_mod_n384_6.S | 2 +- x86_att/p384/bignum_mod_n384_alt.S | 2 +- x86_att/p384/bignum_mod_p384.S | 2 +- x86_att/p384/bignum_mod_p384_6.S | 2 +- x86_att/p384/bignum_mod_p384_alt.S | 2 +- x86_att/p384/bignum_montmul_p384.S | 2 +- x86_att/p384/bignum_montmul_p384_alt.S | 2 +- x86_att/p384/bignum_montsqr_p384.S | 2 +- x86_att/p384/bignum_montsqr_p384_alt.S | 2 +- x86_att/p384/bignum_mux_6.S | 2 +- x86_att/p384/bignum_neg_p384.S | 2 +- x86_att/p384/bignum_nonzero_6.S | 2 +- x86_att/p384/bignum_optneg_p384.S | 2 +- x86_att/p384/bignum_sub_p384.S | 2 +- x86_att/p384/bignum_tomont_p384.S | 2 +- x86_att/p384/bignum_tomont_p384_alt.S | 2 +- x86_att/p384/bignum_triple_p384.S | 2 +- x86_att/p384/bignum_triple_p384_alt.S | 2 +- x86_att/p384/p384_montjadd.S | 2 +- x86_att/p384/p384_montjdouble.S | 2 +- x86_att/p384/p384_montjmixadd.S | 2 +- x86_att/p521/bignum_add_p521.S | 2 +- x86_att/p521/bignum_cmul_p521.S | 2 +- x86_att/p521/bignum_cmul_p521_alt.S | 2 +- x86_att/p521/bignum_deamont_p521.S | 2 +- x86_att/p521/bignum_demont_p521.S | 2 +- x86_att/p521/bignum_double_p521.S | 2 +- x86_att/p521/bignum_fromlebytes_p521.S | 2 +- x86_att/p521/bignum_half_p521.S | 2 +- x86_att/p521/bignum_mod_n521_9.S | 2 +- x86_att/p521/bignum_mod_n521_9_alt.S | 2 +- x86_att/p521/bignum_mod_p521_9.S | 2 +- x86_att/p521/bignum_montmul_p521.S | 2 +- x86_att/p521/bignum_montmul_p521_alt.S | 2 +- x86_att/p521/bignum_montsqr_p521.S | 2 +- x86_att/p521/bignum_montsqr_p521_alt.S | 2 +- x86_att/p521/bignum_mul_p521.S | 2 +- x86_att/p521/bignum_mul_p521_alt.S | 2 +- x86_att/p521/bignum_neg_p521.S | 2 +- x86_att/p521/bignum_optneg_p521.S | 2 +- x86_att/p521/bignum_sqr_p521.S | 2 +- 
x86_att/p521/bignum_sqr_p521_alt.S | 2 +- x86_att/p521/bignum_sub_p521.S | 2 +- x86_att/p521/bignum_tolebytes_p521.S | 2 +- x86_att/p521/bignum_tomont_p521.S | 2 +- x86_att/p521/bignum_triple_p521.S | 2 +- x86_att/p521/bignum_triple_p521_alt.S | 2 +- x86_att/p521/p521_jadd.S | 2 +- x86_att/p521/p521_jdouble.S | 2 +- x86_att/p521/p521_jmixadd.S | 2 +- 155 files changed, 155 insertions(+), 155 deletions(-) diff --git a/arm/curve25519/bignum_mod_n25519.S b/arm/curve25519/bignum_mod_n25519.S index 5a256ed133..3f8a94c9bb 100644 --- a/arm/curve25519/bignum_mod_n25519.S +++ b/arm/curve25519/bignum_mod_n25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo basepoint order, z := x mod n_25519 diff --git a/arm/curve25519/bignum_neg_p25519.S b/arm/curve25519/bignum_neg_p25519.S index 8466df43c1..e3e85b4ecf 100644 --- a/arm/curve25519/bignum_neg_p25519.S +++ b/arm/curve25519/bignum_neg_p25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 7514dac33a..5aaaaa0f5a 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/arm/curve25519/curve25519_x25519_alt.S b/arm/curve25519/curve25519_x25519_alt.S index 261b82c90a..82de375b14 100644 --- a/arm/curve25519/curve25519_x25519_alt.S +++ b/arm/curve25519/curve25519_x25519_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 7837118421..3e3c03371d 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519_byte_alt.S b/arm/curve25519/curve25519_x25519_byte_alt.S index 6523822d2c..790cb2b030 100644 --- a/arm/curve25519/curve25519_x25519_byte_alt.S +++ b/arm/curve25519/curve25519_x25519_byte_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519base.S b/arm/curve25519/curve25519_x25519base.S index b9c3b8e34a..ef46f7b169 100644 --- a/arm/curve25519/curve25519_x25519base.S +++ b/arm/curve25519/curve25519_x25519base.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/arm/curve25519/curve25519_x25519base_alt.S b/arm/curve25519/curve25519_x25519base_alt.S index 22de69f4c3..702fe6e88a 100644 --- a/arm/curve25519/curve25519_x25519base_alt.S +++ b/arm/curve25519/curve25519_x25519base_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/arm/curve25519/curve25519_x25519base_byte.S b/arm/curve25519/curve25519_x25519base_byte.S index aecc693c66..635729cb77 100644 --- a/arm/curve25519/curve25519_x25519base_byte.S +++ b/arm/curve25519/curve25519_x25519base_byte.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 (byte array arguments) diff --git a/arm/curve25519/curve25519_x25519base_byte_alt.S b/arm/curve25519/curve25519_x25519base_byte_alt.S index 9c9dca518c..39b6bfd172 100644 --- a/arm/curve25519/curve25519_x25519base_byte_alt.S +++ b/arm/curve25519/curve25519_x25519base_byte_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 (byte array arguments) diff --git a/arm/curve25519/edwards25519_decode.S b/arm/curve25519/edwards25519_decode.S index 653689be94..f565df90fd 100644 --- a/arm/curve25519/edwards25519_decode.S +++ b/arm/curve25519/edwards25519_decode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/arm/curve25519/edwards25519_decode_alt.S b/arm/curve25519/edwards25519_decode_alt.S index a8e842f15a..befacd2ff0 100644 --- a/arm/curve25519/edwards25519_decode_alt.S +++ b/arm/curve25519/edwards25519_decode_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/arm/curve25519/edwards25519_encode.S b/arm/curve25519/edwards25519_encode.S index 4cf301a227..c0f2e3fc9e 100644 --- a/arm/curve25519/edwards25519_encode.S +++ b/arm/curve25519/edwards25519_encode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Encode edwards25519 point into compressed form as 256-bit number diff --git a/arm/curve25519/edwards25519_scalarmulbase.S b/arm/curve25519/edwards25519_scalarmulbase.S index 89e98494ac..e00aa7e278 100644 --- a/arm/curve25519/edwards25519_scalarmulbase.S +++ b/arm/curve25519/edwards25519_scalarmulbase.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/arm/curve25519/edwards25519_scalarmulbase_alt.S b/arm/curve25519/edwards25519_scalarmulbase_alt.S index e89d58b378..2ffc7799ed 100644 --- a/arm/curve25519/edwards25519_scalarmulbase_alt.S +++ b/arm/curve25519/edwards25519_scalarmulbase_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/arm/curve25519/edwards25519_scalarmuldouble.S b/arm/curve25519/edwards25519_scalarmuldouble.S index d6fc9121f9..d8c6e21c6e 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble.S +++ b/arm/curve25519/edwards25519_scalarmuldouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/arm/curve25519/edwards25519_scalarmuldouble_alt.S b/arm/curve25519/edwards25519_scalarmuldouble_alt.S index 54cebef997..9c3d6db2cb 100644 --- a/arm/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/arm/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/arm/fastmul/bignum_emontredc_8n.S b/arm/fastmul/bignum_emontredc_8n.S index 0876ddea8b..081f5de362 100644 --- a/arm/fastmul/bignum_emontredc_8n.S +++ b/arm/fastmul/bignum_emontredc_8n.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Extended Montgomery reduce in 8-digit blocks, results in input-output buffer diff --git a/arm/fastmul/bignum_kmul_16_32.S b/arm/fastmul/bignum_kmul_16_32.S index 2367b69891..e45dd487e1 100644 --- a/arm/fastmul/bignum_kmul_16_32.S +++ b/arm/fastmul/bignum_kmul_16_32.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/fastmul/bignum_kmul_32_64.S b/arm/fastmul/bignum_kmul_32_64.S index 467d298697..e45249462a 100644 --- a/arm/fastmul/bignum_kmul_32_64.S +++ b/arm/fastmul/bignum_kmul_32_64.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/fastmul/bignum_ksqr_16_32.S b/arm/fastmul/bignum_ksqr_16_32.S index bb62a9c0ca..7be2ac6c45 100644 --- a/arm/fastmul/bignum_ksqr_16_32.S +++ b/arm/fastmul/bignum_ksqr_16_32.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square, z := x^2 diff --git a/arm/fastmul/bignum_ksqr_32_64.S b/arm/fastmul/bignum_ksqr_32_64.S index fbd3c47bec..659e00a791 100644 --- a/arm/fastmul/bignum_ksqr_32_64.S +++ b/arm/fastmul/bignum_ksqr_32_64.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square, z := x^2 diff --git a/arm/generic/bignum_ge.S b/arm/generic/bignum_ge.S index a646b47d43..5ba0b8eda9 100644 --- a/arm/generic/bignum_ge.S +++ b/arm/generic/bignum_ge.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Compare bignums, x >= y diff --git a/arm/generic/bignum_mul.S b/arm/generic/bignum_mul.S index 1da4bf9516..f02665c36b 100644 --- a/arm/generic/bignum_mul.S +++ b/arm/generic/bignum_mul.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply z := x * y diff --git a/arm/generic/bignum_optsub.S b/arm/generic/bignum_optsub.S index 285536ef74..e696198fc4 100644 --- a/arm/generic/bignum_optsub.S +++ b/arm/generic/bignum_optsub.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally subtract, z := x - y (if p nonzero) or z := x (if p zero) diff --git a/arm/generic/bignum_sqr.S b/arm/generic/bignum_sqr.S index 1a75dbddbb..2305cce102 100644 --- a/arm/generic/bignum_sqr.S +++ b/arm/generic/bignum_sqr.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square z := x^2 diff --git a/arm/p384/Makefile b/arm/p384/Makefile index 2390e53e44..564b9dd93c 100644 --- a/arm/p384/Makefile +++ b/arm/p384/Makefile @@ -1,6 +1,6 @@ ############################################################################# # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 OR ISC +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 ############################################################################# # If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise diff --git a/arm/p384/bignum_add_p384.S b/arm/p384/bignum_add_p384.S index 00c8e81d31..ad7f2c6b7b 100644 --- a/arm/p384/bignum_add_p384.S +++ b/arm/p384/bignum_add_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced diff --git a/arm/p384/bignum_bigendian_6.S b/arm/p384/bignum_bigendian_6.S index 664ae845dd..cb103d691c 100644 --- a/arm/p384/bignum_bigendian_6.S +++ b/arm/p384/bignum_bigendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from big-endian form diff --git a/arm/p384/bignum_cmul_p384.S b/arm/p384/bignum_cmul_p384.S index b9570c7998..74f648b4c5 100644 --- a/arm/p384/bignum_cmul_p384.S +++ b/arm/p384/bignum_cmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/arm/p384/bignum_deamont_p384.S b/arm/p384/bignum_deamont_p384.S index 91ea265a97..1f84a4becf 100644 --- a/arm/p384/bignum_deamont_p384.S +++ b/arm/p384/bignum_deamont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/arm/p384/bignum_demont_p384.S b/arm/p384/bignum_demont_p384.S index c0dd331d64..1b09517288 100644 --- a/arm/p384/bignum_demont_p384.S +++ b/arm/p384/bignum_demont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_double_p384.S b/arm/p384/bignum_double_p384.S index fce40a0ff1..07b1a57f20 100644 --- a/arm/p384/bignum_double_p384.S +++ b/arm/p384/bignum_double_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_half_p384.S b/arm/p384/bignum_half_p384.S index e3a7ff0e77..c023542b1b 100644 --- a/arm/p384/bignum_half_p384.S +++ b/arm/p384/bignum_half_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_littleendian_6.S b/arm/p384/bignum_littleendian_6.S index 66b0424a51..f325456298 100644 --- a/arm/p384/bignum_littleendian_6.S +++ b/arm/p384/bignum_littleendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from little-endian form diff --git a/arm/p384/bignum_mod_n384.S b/arm/p384/bignum_mod_n384.S index e8de84d4cb..a91bb2c5b5 100644 --- a/arm/p384/bignum_mod_n384.S +++ b/arm/p384/bignum_mod_n384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/arm/p384/bignum_mod_n384_6.S b/arm/p384/bignum_mod_n384_6.S index c382e642ca..e79ad3fe85 100644 --- a/arm/p384/bignum_mod_n384_6.S +++ b/arm/p384/bignum_mod_n384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/arm/p384/bignum_mod_p384.S b/arm/p384/bignum_mod_p384.S index c2ab35526f..cf7f1d6bbb 100644 --- a/arm/p384/bignum_mod_p384.S +++ b/arm/p384/bignum_mod_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/arm/p384/bignum_mod_p384_6.S b/arm/p384/bignum_mod_p384_6.S index a1ac615b1a..959dc86239 100644 --- a/arm/p384/bignum_mod_p384_6.S +++ b/arm/p384/bignum_mod_p384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/arm/p384/bignum_montmul_p384.S b/arm/p384/bignum_montmul_p384.S index 554081f39e..05c3d1786a 100644 --- a/arm/p384/bignum_montmul_p384.S +++ b/arm/p384/bignum_montmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/arm/p384/bignum_montmul_p384_alt.S b/arm/p384/bignum_montmul_p384_alt.S index 2bd28cfffa..a6464f07cc 100644 --- a/arm/p384/bignum_montmul_p384_alt.S +++ b/arm/p384/bignum_montmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/arm/p384/bignum_montsqr_p384.S b/arm/p384/bignum_montsqr_p384.S index 1067bf1a78..fd55c1bf02 100644 --- a/arm/p384/bignum_montsqr_p384.S +++ b/arm/p384/bignum_montsqr_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/arm/p384/bignum_montsqr_p384_alt.S b/arm/p384/bignum_montsqr_p384_alt.S index e4fe2f7f5b..f49830d21e 100644 --- a/arm/p384/bignum_montsqr_p384_alt.S +++ b/arm/p384/bignum_montsqr_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/arm/p384/bignum_mux_6.S b/arm/p384/bignum_mux_6.S index b4c966609f..21d1769949 100644 --- a/arm/p384/bignum_mux_6.S +++ b/arm/p384/bignum_mux_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) diff --git a/arm/p384/bignum_neg_p384.S b/arm/p384/bignum_neg_p384.S index 24bdbb1b23..186d50e881 100644 --- a/arm/p384/bignum_neg_p384.S +++ b/arm/p384/bignum_neg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced diff --git a/arm/p384/bignum_nonzero_6.S b/arm/p384/bignum_nonzero_6.S index ae003186b8..b98fe9d863 100644 --- a/arm/p384/bignum_nonzero_6.S +++ b/arm/p384/bignum_nonzero_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero diff --git a/arm/p384/bignum_optneg_p384.S b/arm/p384/bignum_optneg_p384.S index 7b5e704348..325fccbcf4 100644 --- a/arm/p384/bignum_optneg_p384.S +++ b/arm/p384/bignum_optneg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or diff --git a/arm/p384/bignum_sub_p384.S b/arm/p384/bignum_sub_p384.S index bd7a9deeff..1e5085628b 100644 --- a/arm/p384/bignum_sub_p384.S +++ b/arm/p384/bignum_sub_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_384, z := (x - y) mod p_384 diff --git a/arm/p384/bignum_tomont_p384.S b/arm/p384/bignum_tomont_p384.S index efed55f8c0..c666f5e78f 100644 --- a/arm/p384/bignum_tomont_p384.S +++ b/arm/p384/bignum_tomont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/arm/p384/bignum_triple_p384.S b/arm/p384/bignum_triple_p384.S index cc641a2eeb..d129b8712f 100644 --- a/arm/p384/bignum_triple_p384.S +++ b/arm/p384/bignum_triple_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/arm/p384/p384_montjadd.S b/arm/p384/p384_montjadd.S index 98f40b0a80..9c0e1ecb99 100644 --- a/arm/p384/p384_montjadd.S +++ b/arm/p384/p384_montjadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p384/p384_montjdouble.S b/arm/p384/p384_montjdouble.S index 5b4a609b59..7dfd9766f2 100644 --- a/arm/p384/p384_montjdouble.S +++ b/arm/p384/p384_montjdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. 
All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p384/p384_montjmixadd.S b/arm/p384/p384_montjmixadd.S index 0f5c24203f..1b0165ab8c 100644 --- a/arm/p384/p384_montjmixadd.S +++ b/arm/p384/p384_montjmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/arm/p521/Makefile b/arm/p521/Makefile index 9121b81013..ae0d4f8d70 100644 --- a/arm/p521/Makefile +++ b/arm/p521/Makefile @@ -1,6 +1,6 @@ ############################################################################# # Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -# SPDX-License-Identifier: Apache-2.0 OR ISC +# SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 ############################################################################# # If actually on an ARM8 machine, just use the GNU assembler (as). Otherwise diff --git a/arm/p521/bignum_add_p521.S b/arm/p521/bignum_add_p521.S index d9d59bbd48..248db96ef2 100644 --- a/arm/p521/bignum_add_p521.S +++ b/arm/p521/bignum_add_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_cmul_p521.S b/arm/p521/bignum_cmul_p521.S index 0b657b8b73..00f9cf0be5 100644 --- a/arm/p521/bignum_cmul_p521.S +++ b/arm/p521/bignum_cmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/arm/p521/bignum_deamont_p521.S b/arm/p521/bignum_deamont_p521.S index 442e5d4048..83849147f8 100644 --- a/arm/p521/bignum_deamont_p521.S +++ b/arm/p521/bignum_deamont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521 diff --git a/arm/p521/bignum_demont_p521.S b/arm/p521/bignum_demont_p521.S index d3004ec580..1b48113e01 100644 --- a/arm/p521/bignum_demont_p521.S +++ b/arm/p521/bignum_demont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_double_p521.S b/arm/p521/bignum_double_p521.S index 8d0e291120..ecfdcf2f74 100644 --- a/arm/p521/bignum_double_p521.S +++ b/arm/p521/bignum_double_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_fromlebytes_p521.S b/arm/p521/bignum_fromlebytes_p521.S index 7a87ed3338..fd0d8ca362 100644 --- a/arm/p521/bignum_fromlebytes_p521.S +++ b/arm/p521/bignum_fromlebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert little-endian bytes to 9-digit 528-bit bignum diff --git a/arm/p521/bignum_half_p521.S b/arm/p521/bignum_half_p521.S index 1f8da155ba..757156b266 100644 --- a/arm/p521/bignum_half_p521.S +++ b/arm/p521/bignum_half_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_mod_n521_9.S b/arm/p521/bignum_mod_n521_9.S index 65bc4f08bb..d680e5f1db 100644 --- a/arm/p521/bignum_mod_n521_9.S +++ b/arm/p521/bignum_mod_n521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/arm/p521/bignum_mod_p521_9.S b/arm/p521/bignum_mod_p521_9.S index 874e9df091..56385905ac 100644 --- a/arm/p521/bignum_mod_p521_9.S +++ b/arm/p521/bignum_mod_p521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_521 diff --git a/arm/p521/bignum_montmul_p521.S b/arm/p521/bignum_montmul_p521.S index c0ac8cf926..e1ea8dc0c2 100644 --- a/arm/p521/bignum_montmul_p521.S +++ b/arm/p521/bignum_montmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/arm/p521/bignum_montmul_p521_alt.S b/arm/p521/bignum_montmul_p521_alt.S index 6b0afeac1d..8c302ce1f8 100644 --- a/arm/p521/bignum_montmul_p521_alt.S +++ b/arm/p521/bignum_montmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/arm/p521/bignum_montsqr_p521.S b/arm/p521/bignum_montsqr_p521.S index 45e57a666e..2c8dbd789f 100644 --- a/arm/p521/bignum_montsqr_p521.S +++ b/arm/p521/bignum_montsqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/arm/p521/bignum_montsqr_p521_alt.S b/arm/p521/bignum_montsqr_p521_alt.S index 1ae774f0d3..1376cf8eb7 100644 --- a/arm/p521/bignum_montsqr_p521_alt.S +++ b/arm/p521/bignum_montsqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/arm/p521/bignum_mul_p521.S b/arm/p521/bignum_mul_p521.S index 12594faf9a..97859d6bbe 100644 --- a/arm/p521/bignum_mul_p521.S +++ b/arm/p521/bignum_mul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_mul_p521_alt.S b/arm/p521/bignum_mul_p521_alt.S index d0c2cdb0e6..ea39156aaa 100644 --- a/arm/p521/bignum_mul_p521_alt.S +++ b/arm/p521/bignum_mul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/arm/p521/bignum_neg_p521.S b/arm/p521/bignum_neg_p521.S index cdf7a9641c..488f3660b0 100644 --- a/arm/p521/bignum_neg_p521.S +++ b/arm/p521/bignum_neg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_optneg_p521.S b/arm/p521/bignum_optneg_p521.S index 74fac18e5a..8c5dfda4db 100644 --- a/arm/p521/bignum_optneg_p521.S +++ b/arm/p521/bignum_optneg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or diff --git a/arm/p521/bignum_sqr_p521.S b/arm/p521/bignum_sqr_p521.S index 23f8a3b9b2..404665258c 100644 --- a/arm/p521/bignum_sqr_p521.S +++ b/arm/p521/bignum_sqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_sqr_p521_alt.S b/arm/p521/bignum_sqr_p521_alt.S index 7837b23a3d..439dd2e7e6 100644 --- a/arm/p521/bignum_sqr_p521_alt.S +++ b/arm/p521/bignum_sqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/arm/p521/bignum_sub_p521.S b/arm/p521/bignum_sub_p521.S index 4cc4e830b5..8ff430d500 100644 --- a/arm/p521/bignum_sub_p521.S +++ b/arm/p521/bignum_sub_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_521, z := (x - y) mod p_521 diff --git a/arm/p521/bignum_tolebytes_p521.S b/arm/p521/bignum_tolebytes_p521.S index 403f8fbd64..b1c4b3eaf1 100644 --- a/arm/p521/bignum_tolebytes_p521.S +++ b/arm/p521/bignum_tolebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 9-digit 528-bit bignum to little-endian bytes diff --git a/arm/p521/bignum_tomont_p521.S b/arm/p521/bignum_tomont_p521.S index 833c07b847..c94cd12ca0 100644 --- a/arm/p521/bignum_tomont_p521.S +++ b/arm/p521/bignum_tomont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^576 * x) mod p_521 diff --git a/arm/p521/bignum_triple_p521.S b/arm/p521/bignum_triple_p521.S index 7ce5d00915..961df99351 100644 --- a/arm/p521/bignum_triple_p521.S +++ b/arm/p521/bignum_triple_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. 
or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/arm/p521/p521_jadd.S b/arm/p521/p521_jadd.S index 928d7ea6cc..1d6b196c8c 100644 --- a/arm/p521/p521_jadd.S +++ b/arm/p521/p521_jadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-521 in Jacobian coordinates diff --git a/arm/p521/p521_jdouble.S b/arm/p521/p521_jdouble.S index 6794e4cd92..100f6d3e87 100644 --- a/arm/p521/p521_jdouble.S +++ b/arm/p521/p521_jdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-521 in Jacobian coordinates diff --git a/arm/p521/p521_jmixadd.S b/arm/p521/p521_jmixadd.S index cd27d24eb8..c9b62a9aa1 100644 --- a/arm/p521/p521_jmixadd.S +++ b/arm/p521/p521_jmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/curve25519/bignum_mod_n25519.S b/x86_att/curve25519/bignum_mod_n25519.S index c45d99b541..52f8bfdd57 100644 --- a/x86_att/curve25519/bignum_mod_n25519.S +++ b/x86_att/curve25519/bignum_mod_n25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo basepoint order, z := x mod n_25519 diff --git a/x86_att/curve25519/bignum_neg_p25519.S b/x86_att/curve25519/bignum_neg_p25519.S index 02d01b1241..5e66073baf 100644 --- a/x86_att/curve25519/bignum_neg_p25519.S +++ b/x86_att/curve25519/bignum_neg_p25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_25519, z := (-x) mod p_25519, assuming x reduced diff --git a/x86_att/curve25519/curve25519_x25519.S b/x86_att/curve25519/curve25519_x25519.S index b46c522b36..87e5e9cf62 100644 --- a/x86_att/curve25519/curve25519_x25519.S +++ b/x86_att/curve25519/curve25519_x25519.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/x86_att/curve25519/curve25519_x25519_alt.S b/x86_att/curve25519/curve25519_x25519_alt.S index dd644dbba9..4a63a55f11 100644 --- a/x86_att/curve25519/curve25519_x25519_alt.S +++ b/x86_att/curve25519/curve25519_x25519_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 diff --git a/x86_att/curve25519/curve25519_x25519base.S b/x86_att/curve25519/curve25519_x25519base.S index e450656861..dda3b1707b 100644 --- a/x86_att/curve25519/curve25519_x25519base.S +++ b/x86_att/curve25519/curve25519_x25519base.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/x86_att/curve25519/curve25519_x25519base_alt.S b/x86_att/curve25519/curve25519_x25519base_alt.S index b1275e2084..b6c82faba0 100644 --- a/x86_att/curve25519/curve25519_x25519base_alt.S +++ b/x86_att/curve25519/curve25519_x25519base_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // The x25519 function for curve25519 on base element 9 diff --git a/x86_att/curve25519/edwards25519_decode.S b/x86_att/curve25519/edwards25519_decode.S index 24431ef564..ae63e0dacb 100644 --- a/x86_att/curve25519/edwards25519_decode.S +++ b/x86_att/curve25519/edwards25519_decode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/x86_att/curve25519/edwards25519_decode_alt.S b/x86_att/curve25519/edwards25519_decode_alt.S index c7854380e1..8bfe721253 100644 --- a/x86_att/curve25519/edwards25519_decode_alt.S +++ b/x86_att/curve25519/edwards25519_decode_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Decode compressed 256-bit form of edwards25519 point diff --git a/x86_att/curve25519/edwards25519_encode.S b/x86_att/curve25519/edwards25519_encode.S index bdbaa47232..13b0102d09 100644 --- a/x86_att/curve25519/edwards25519_encode.S +++ b/x86_att/curve25519/edwards25519_encode.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Encode edwards25519 point into compressed form as 256-bit number diff --git a/x86_att/curve25519/edwards25519_scalarmulbase.S b/x86_att/curve25519/edwards25519_scalarmulbase.S index 950b8dc649..6b2a80c728 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S index db7fa574b5..4796e72189 100644 --- a/x86_att/curve25519/edwards25519_scalarmulbase_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmulbase_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Scalar multiplication for the edwards25519 standard basepoint diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble.S b/x86_att/curve25519/edwards25519_scalarmuldouble.S index eabdcd461b..993c420e05 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S index b285d57ff5..e7c8f7a59d 100644 --- a/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S +++ b/x86_att/curve25519/edwards25519_scalarmuldouble_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double scalar multiplication for edwards25519, fresh and base point diff --git a/x86_att/p384/bignum_add_p384.S b/x86_att/p384/bignum_add_p384.S index b0a3c9c517..94293e4e70 100644 --- a/x86_att/p384/bignum_add_p384.S +++ b/x86_att/p384/bignum_add_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_384, z := (x + y) mod p_384, assuming x and y reduced diff --git a/x86_att/p384/bignum_bigendian_6.S b/x86_att/p384/bignum_bigendian_6.S index 7fa59c536e..0a23e35659 100644 --- a/x86_att/p384/bignum_bigendian_6.S +++ b/x86_att/p384/bignum_bigendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from big-endian form diff --git a/x86_att/p384/bignum_cmul_p384.S b/x86_att/p384/bignum_cmul_p384.S index 6632a9ae7e..76f6795087 100644 --- a/x86_att/p384/bignum_cmul_p384.S +++ b/x86_att/p384/bignum_cmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/x86_att/p384/bignum_cmul_p384_alt.S b/x86_att/p384/bignum_cmul_p384_alt.S index c91629cd30..2e21e64615 100644 --- a/x86_att/p384/bignum_cmul_p384_alt.S +++ b/x86_att/p384/bignum_cmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_384, z := (c * x) mod p_384, assuming diff --git a/x86_att/p384/bignum_deamont_p384.S b/x86_att/p384/bignum_deamont_p384.S index 6b7daea25e..9edb4ab610 100644 --- a/x86_att/p384/bignum_deamont_p384.S +++ b/x86_att/p384/bignum_deamont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_deamont_p384_alt.S b/x86_att/p384/bignum_deamont_p384_alt.S index 918a104f63..c0e6096bdd 100644 --- a/x86_att/p384/bignum_deamont_p384_alt.S +++ b/x86_att/p384/bignum_deamont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from almost-Montgomery form, z := (x / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_demont_p384.S b/x86_att/p384/bignum_demont_p384.S index 3dc1d734c4..36a5ef0078 100644 --- a/x86_att/p384/bignum_demont_p384.S +++ b/x86_att/p384/bignum_demont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_demont_p384_alt.S b/x86_att/p384/bignum_demont_p384_alt.S index d2dca9c4f2..adccd962e7 100644 --- a/x86_att/p384/bignum_demont_p384_alt.S +++ b/x86_att/p384/bignum_demont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^384) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_double_p384.S b/x86_att/p384/bignum_double_p384.S index c06b218889..7e0c35dab3 100644 --- a/x86_att/p384/bignum_double_p384.S +++ b/x86_att/p384/bignum_double_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_384, z := (2 * x) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_half_p384.S b/x86_att/p384/bignum_half_p384.S index 51afea03bb..a3e3954173 100644 --- a/x86_att/p384/bignum_half_p384.S +++ b/x86_att/p384/bignum_half_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_384, z := (x / 2) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_littleendian_6.S b/x86_att/p384/bignum_littleendian_6.S index a0eef1f00f..fe5744a86e 100644 --- a/x86_att/p384/bignum_littleendian_6.S +++ b/x86_att/p384/bignum_littleendian_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 6-digit (384-bit) bignum to/from little-endian form diff --git a/x86_att/p384/bignum_mod_n384.S b/x86_att/p384/bignum_mod_n384.S index 963873f72e..169a136ea3 100644 --- a/x86_att/p384/bignum_mod_n384.S +++ b/x86_att/p384/bignum_mod_n384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_n384_6.S b/x86_att/p384/bignum_mod_n384_6.S index 273bce8b33..6b68c2a444 100644 --- a/x86_att/p384/bignum_mod_n384_6.S +++ b/x86_att/p384/bignum_mod_n384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_n384_alt.S b/x86_att/p384/bignum_mod_n384_alt.S index ffd9c9d1b9..92282a83a7 100644 --- a/x86_att/p384/bignum_mod_n384_alt.S +++ b/x86_att/p384/bignum_mod_n384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_384 diff --git a/x86_att/p384/bignum_mod_p384.S b/x86_att/p384/bignum_mod_p384.S index 10414fea41..c9caf41c83 100644 --- a/x86_att/p384/bignum_mod_p384.S +++ b/x86_att/p384/bignum_mod_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_mod_p384_6.S b/x86_att/p384/bignum_mod_p384_6.S index 08381a6c1e..7196a76f31 100644 --- a/x86_att/p384/bignum_mod_p384_6.S +++ b/x86_att/p384/bignum_mod_p384_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_mod_p384_alt.S b/x86_att/p384/bignum_mod_p384_alt.S index 689f1d340c..79da7842a6 100644 --- a/x86_att/p384/bignum_mod_p384_alt.S +++ b/x86_att/p384/bignum_mod_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_384 diff --git a/x86_att/p384/bignum_montmul_p384.S b/x86_att/p384/bignum_montmul_p384.S index 718991aac1..105efac610 100644 --- a/x86_att/p384/bignum_montmul_p384.S +++ b/x86_att/p384/bignum_montmul_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montmul_p384_alt.S b/x86_att/p384/bignum_montmul_p384_alt.S index 3da172840e..5a8b4905d9 100644 --- a/x86_att/p384/bignum_montmul_p384_alt.S +++ b/x86_att/p384/bignum_montmul_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montsqr_p384.S b/x86_att/p384/bignum_montsqr_p384.S index f8b4230b7e..0d0b36013a 100644 --- a/x86_att/p384/bignum_montsqr_p384.S +++ b/x86_att/p384/bignum_montsqr_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_montsqr_p384_alt.S b/x86_att/p384/bignum_montsqr_p384_alt.S index e04807766c..061ef6181d 100644 --- a/x86_att/p384/bignum_montsqr_p384_alt.S +++ b/x86_att/p384/bignum_montsqr_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^384) mod p_384 diff --git a/x86_att/p384/bignum_mux_6.S b/x86_att/p384/bignum_mux_6.S index 5277428379..cb4c2ca503 100644 --- a/x86_att/p384/bignum_mux_6.S +++ b/x86_att/p384/bignum_mux_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit multiplex/select z := x (if p nonzero) or z := y (if p zero) diff --git a/x86_att/p384/bignum_neg_p384.S b/x86_att/p384/bignum_neg_p384.S index 51b0f41bb1..746c01286a 100644 --- a/x86_att/p384/bignum_neg_p384.S +++ b/x86_att/p384/bignum_neg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_384, z := (-x) mod p_384, assuming x reduced diff --git a/x86_att/p384/bignum_nonzero_6.S b/x86_att/p384/bignum_nonzero_6.S index 8e17207d4a..7fdb6bab06 100644 --- a/x86_att/p384/bignum_nonzero_6.S +++ b/x86_att/p384/bignum_nonzero_6.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // 384-bit nonzeroness test, returning 1 if x is nonzero, 0 if x is zero diff --git a/x86_att/p384/bignum_optneg_p384.S b/x86_att/p384/bignum_optneg_p384.S index cee7be2f3c..0a8b247e5d 100644 --- a/x86_att/p384/bignum_optneg_p384.S +++ b/x86_att/p384/bignum_optneg_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_384, z := (-x) mod p_384 (if p nonzero) or diff --git a/x86_att/p384/bignum_sub_p384.S b/x86_att/p384/bignum_sub_p384.S index 8d4ae986a2..5914f4ae9c 100644 --- a/x86_att/p384/bignum_sub_p384.S +++ b/x86_att/p384/bignum_sub_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_384, z := (x - y) mod p_384 diff --git a/x86_att/p384/bignum_tomont_p384.S b/x86_att/p384/bignum_tomont_p384.S index 70463c73a6..66503a2ec4 100644 --- a/x86_att/p384/bignum_tomont_p384.S +++ b/x86_att/p384/bignum_tomont_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/x86_att/p384/bignum_tomont_p384_alt.S b/x86_att/p384/bignum_tomont_p384_alt.S index 75ba90d7f7..725713d341 100644 --- a/x86_att/p384/bignum_tomont_p384_alt.S +++ b/x86_att/p384/bignum_tomont_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^384 * x) mod p_384 diff --git a/x86_att/p384/bignum_triple_p384.S b/x86_att/p384/bignum_triple_p384.S index 2d3ae66bf7..52b70f6bea 100644 --- a/x86_att/p384/bignum_triple_p384.S +++ b/x86_att/p384/bignum_triple_p384.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/x86_att/p384/bignum_triple_p384_alt.S b/x86_att/p384/bignum_triple_p384_alt.S index 91efffbe1e..bdbf7e8f6d 100644 --- a/x86_att/p384/bignum_triple_p384_alt.S +++ b/x86_att/p384/bignum_triple_p384_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_384, z := (3 * x) mod p_384 diff --git a/x86_att/p384/p384_montjadd.S b/x86_att/p384/p384_montjadd.S index 52b86b2063..27b58bfc14 100644 --- a/x86_att/p384/p384_montjadd.S +++ b/x86_att/p384/p384_montjadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p384/p384_montjdouble.S b/x86_att/p384/p384_montjdouble.S index 80e0b6cc88..b51d24f931 100644 --- a/x86_att/p384/p384_montjdouble.S +++ b/x86_att/p384/p384_montjdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p384/p384_montjmixadd.S b/x86_att/p384/p384_montjmixadd.S index 8a8c17c1a0..0d456464b9 100644 --- a/x86_att/p384/p384_montjmixadd.S +++ b/x86_att/p384/p384_montjmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-384 in Montgomery-Jacobian coordinates diff --git a/x86_att/p521/bignum_add_p521.S b/x86_att/p521/bignum_add_p521.S index 849a740971..b046828d45 100644 --- a/x86_att/p521/bignum_add_p521.S +++ b/x86_att/p521/bignum_add_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Add modulo p_521, z := (x + y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_cmul_p521.S b/x86_att/p521/bignum_cmul_p521.S index 7898293c6a..fbfc3063fd 100644 --- a/x86_att/p521/bignum_cmul_p521.S +++ b/x86_att/p521/bignum_cmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/x86_att/p521/bignum_cmul_p521_alt.S b/x86_att/p521/bignum_cmul_p521_alt.S index c5f79a8189..fd6986f232 100644 --- a/x86_att/p521/bignum_cmul_p521_alt.S +++ b/x86_att/p521/bignum_cmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply by a single word modulo p_521, z := (c * x) mod p_521, assuming diff --git a/x86_att/p521/bignum_deamont_p521.S b/x86_att/p521/bignum_deamont_p521.S index d916da1f95..099c0e33fc 100644 --- a/x86_att/p521/bignum_deamont_p521.S +++ b/x86_att/p521/bignum_deamont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_demont_p521.S b/x86_att/p521/bignum_demont_p521.S index 182360406a..ef83448b15 100644 --- a/x86_att/p521/bignum_demont_p521.S +++ b/x86_att/p521/bignum_demont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert from Montgomery form z := (x / 2^576) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_double_p521.S b/x86_att/p521/bignum_double_p521.S index f3923d82ce..9322ec0b1a 100644 --- a/x86_att/p521/bignum_double_p521.S +++ b/x86_att/p521/bignum_double_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Double modulo p_521, z := (2 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_fromlebytes_p521.S b/x86_att/p521/bignum_fromlebytes_p521.S index a5c9f491d9..6a80dce3c2 100644 --- a/x86_att/p521/bignum_fromlebytes_p521.S +++ b/x86_att/p521/bignum_fromlebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert little-endian bytes to 9-digit 528-bit bignum diff --git a/x86_att/p521/bignum_half_p521.S b/x86_att/p521/bignum_half_p521.S index 9023beb032..ee8b91a325 100644 --- a/x86_att/p521/bignum_half_p521.S +++ b/x86_att/p521/bignum_half_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Halve modulo p_521, z := (x / 2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_mod_n521_9.S b/x86_att/p521/bignum_mod_n521_9.S index 9dcc73d15f..c7e33f88fd 100644 --- a/x86_att/p521/bignum_mod_n521_9.S +++ b/x86_att/p521/bignum_mod_n521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/x86_att/p521/bignum_mod_n521_9_alt.S b/x86_att/p521/bignum_mod_n521_9_alt.S index 026a97e451..aeb314691a 100644 --- a/x86_att/p521/bignum_mod_n521_9_alt.S +++ b/x86_att/p521/bignum_mod_n521_9_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo group order, z := x mod n_521 diff --git a/x86_att/p521/bignum_mod_p521_9.S b/x86_att/p521/bignum_mod_p521_9.S index 0f2e4267f4..0d67aa3ee2 100644 --- a/x86_att/p521/bignum_mod_p521_9.S +++ b/x86_att/p521/bignum_mod_p521_9.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Reduce modulo field characteristic, z := x mod p_521 diff --git a/x86_att/p521/bignum_montmul_p521.S b/x86_att/p521/bignum_montmul_p521.S index 3ee202d458..21d777a655 100644 --- a/x86_att/p521/bignum_montmul_p521.S +++ b/x86_att/p521/bignum_montmul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montmul_p521_alt.S b/x86_att/p521/bignum_montmul_p521_alt.S index dcef877ffd..b3d0d7c2c6 100644 --- a/x86_att/p521/bignum_montmul_p521_alt.S +++ b/x86_att/p521/bignum_montmul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery multiply, z := (x * y / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montsqr_p521.S b/x86_att/p521/bignum_montsqr_p521.S index 91cb9c318d..ede53c627c 100644 --- a/x86_att/p521/bignum_montsqr_p521.S +++ b/x86_att/p521/bignum_montsqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_montsqr_p521_alt.S b/x86_att/p521/bignum_montsqr_p521_alt.S index ad071a453b..dccdc33ef5 100644 --- a/x86_att/p521/bignum_montsqr_p521_alt.S +++ b/x86_att/p521/bignum_montsqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Montgomery square, z := (x^2 / 2^576) mod p_521 diff --git a/x86_att/p521/bignum_mul_p521.S b/x86_att/p521/bignum_mul_p521.S index 25073f9daf..f96e8417ab 100644 --- a/x86_att/p521/bignum_mul_p521.S +++ b/x86_att/p521/bignum_mul_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_mul_p521_alt.S b/x86_att/p521/bignum_mul_p521_alt.S index 3224a86634..f87546928a 100644 --- a/x86_att/p521/bignum_mul_p521_alt.S +++ b/x86_att/p521/bignum_mul_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Multiply modulo p_521, z := (x * y) mod p_521, assuming x and y reduced diff --git a/x86_att/p521/bignum_neg_p521.S b/x86_att/p521/bignum_neg_p521.S index 484c1fca56..9a130b0b30 100644 --- a/x86_att/p521/bignum_neg_p521.S +++ b/x86_att/p521/bignum_neg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Negate modulo p_521, z := (-x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_optneg_p521.S b/x86_att/p521/bignum_optneg_p521.S index d2434adb4c..8f4c740b6b 100644 --- a/x86_att/p521/bignum_optneg_p521.S +++ b/x86_att/p521/bignum_optneg_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Optionally negate modulo p_521, z := (-x) mod p_521 (if p nonzero) or diff --git a/x86_att/p521/bignum_sqr_p521.S b/x86_att/p521/bignum_sqr_p521.S index b9a718cf9b..4b4748f106 100644 --- a/x86_att/p521/bignum_sqr_p521.S +++ b/x86_att/p521/bignum_sqr_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_sqr_p521_alt.S b/x86_att/p521/bignum_sqr_p521_alt.S index 58f496e3f0..475d3d3c81 100644 --- a/x86_att/p521/bignum_sqr_p521_alt.S +++ b/x86_att/p521/bignum_sqr_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Square modulo p_521, z := (x^2) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_sub_p521.S b/x86_att/p521/bignum_sub_p521.S index 99e0d96cd1..03db019833 100644 --- a/x86_att/p521/bignum_sub_p521.S +++ b/x86_att/p521/bignum_sub_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Subtract modulo p_521, z := (x - y) mod p_521 diff --git a/x86_att/p521/bignum_tolebytes_p521.S b/x86_att/p521/bignum_tolebytes_p521.S index c5ea2ed539..7f89172569 100644 --- a/x86_att/p521/bignum_tolebytes_p521.S +++ b/x86_att/p521/bignum_tolebytes_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert 9-digit 528-bit bignum to little-endian bytes diff --git a/x86_att/p521/bignum_tomont_p521.S b/x86_att/p521/bignum_tomont_p521.S index a97beaccb1..39983c24ba 100644 --- a/x86_att/p521/bignum_tomont_p521.S +++ b/x86_att/p521/bignum_tomont_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Convert to Montgomery form z := (2^576 * x) mod p_521 diff --git a/x86_att/p521/bignum_triple_p521.S b/x86_att/p521/bignum_triple_p521.S index 6703a9cb22..264481ef18 100644 --- a/x86_att/p521/bignum_triple_p521.S +++ b/x86_att/p521/bignum_triple_p521.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/bignum_triple_p521_alt.S b/x86_att/p521/bignum_triple_p521_alt.S index 4598d9db87..ecd0798778 100644 --- a/x86_att/p521/bignum_triple_p521_alt.S +++ b/x86_att/p521/bignum_triple_p521_alt.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Triple modulo p_521, z := (3 * x) mod p_521, assuming x reduced diff --git a/x86_att/p521/p521_jadd.S b/x86_att/p521/p521_jadd.S index 256ba845c4..807a7c5472 100644 --- a/x86_att/p521/p521_jadd.S +++ b/x86_att/p521/p521_jadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point addition on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/p521/p521_jdouble.S b/x86_att/p521/p521_jdouble.S index fd2a57bbc9..22ccbebd43 100644 --- a/x86_att/p521/p521_jdouble.S +++ b/x86_att/p521/p521_jdouble.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. -// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point doubling on NIST curve P-521 in Jacobian coordinates diff --git a/x86_att/p521/p521_jmixadd.S b/x86_att/p521/p521_jmixadd.S index 7054905371..702b63f560 100644 --- a/x86_att/p521/p521_jmixadd.S +++ b/x86_att/p521/p521_jmixadd.S @@ -1,5 +1,5 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. 
-// SPDX-License-Identifier: Apache-2.0 OR ISC +// SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 // ---------------------------------------------------------------------------- // Point mixed addition on NIST curve P-521 in Jacobian coordinates From 4cd44bc760a4fdf86b2b7ff31a7494ae3b783e78 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Thu, 8 Feb 2024 16:00:10 -0800 Subject: [PATCH 5/9] Switch non-alt ARM X25519 to unsaturated code following Lenngren This completely changes the implementation of ARM curve25519_x25519 and curve25519_x25519_byte (not the _alt forms, which remain faster on their target microarchitectures) to a base-25.5 unsaturated version with interleaved integer and SIMD operations, the inner loop closely following Emil Lenngren's implementation described in the paper https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf and available here: https://github.com/Emill/X25519-AArch64 A version of this code was generated by SLOTHY from the reorganized implementation by Abdulrahman, Becker, Kannwischer and Klein here: https://github.com/slothy-optimizer/slothy/blob/main/paper/clean/neon/X25519-AArch64-simple.s as described in the associated paper https://eprint.iacr.org/2022/1303.pdf with some additional annotations for use in the formal proof. The final modular inverse computation reverts to the usual saturated representation and s2n-bignum's divstep-based inverse function. s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/fc0b9bf7fb558ea49718317eb0623184d60b6fd6 --- arm/curve25519/curve25519_x25519.S | 2312 +++++++++++++-------- arm/curve25519/curve25519_x25519_byte.S | 2440 ++++++++++++++--------- 2 files changed, 3039 insertions(+), 1713 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 5aaaaa0f5a..28dd2f696a 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -1,6 +1,18 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +// ********************************************************************** +// This code is substantially derived from Emil Lenngren's implementation +// +// https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf +// https://github.com/Emill/X25519-AArch64 +// +// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// +// https://eprint.iacr.org/2022/1303.pdf +// https://github.com/slothy-optimizer/slothy/tree/main/paper +// ********************************************************************** + // ---------------------------------------------------------------------------- // The x25519 function for curve25519 // Inputs scalar[4], point[4]; output res[4] @@ -26,833 +38,1309 @@ .text .balign 4 -// Size of individual field elements - -#define NUMSIZE 32 - -// Stable homes for the input result argument during the whole body -// and other variables that are only needed prior to the modular inverse. - -#define res x23 -#define i x20 -#define swap x21 - -// Pointers to result x coord to be written - -#define resx res, #0 - -// Pointer-offset pairs for temporaries on stack with some aliasing. 
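For readers unfamiliar with the representation the commit message refers to: in Lenngren's base-2^25.5 code a field element mod 2^255 - 19 is held as ten small limbs whose widths alternate between 26 and 25 bits, so carries can be delayed across several additions and multiplications. The following C fragment is only an illustrative sketch of that representation and of one carry pass; the names fe10 and fe10_carry are invented for the illustration and do not appear in s2n-bignum.

    #include <stdint.h>
    #include <stdio.h>

    // Ten-limb unsaturated representation of a value mod p = 2^255 - 19:
    // value = sum over i of v[i] * 2^ceil(25.5*i), so limb i holds 26 bits
    // when i is even and 25 bits when i is odd (26+25+...+26+25 = 255).
    typedef struct { int64_t v[10]; } fe10;

    // One carry pass: move each limb's overflow into the next limb; the
    // carry out of the top limb re-enters limb 0 multiplied by 19, using
    // 2^255 == 19 (mod p).
    static void fe10_carry(fe10 *x)
    {
      for (int i = 0; i < 10; i++) {
        int shift = (i & 1) ? 25 : 26;  // limb widths alternate 26,25,...
        int64_t c = x->v[i] >> shift;   // arithmetic shift: signed carry
        x->v[i] -= c << shift;
        if (i == 9)
          x->v[0] += 19 * c;            // wrap 2^255 * c back in as 19 * c
        else
          x->v[i + 1] += c;
      }
    }

    int main(void)
    {
      fe10 x = {{ (1 << 26) + 5, 0, 0, 0, 0, 0, 0, 0, 0, (1 << 25) + 7 }};
      fe10_carry(&x);   // expect v[0] = 5 + 19 = 24, v[1] = 1, v[9] = 7
      printf("%lld %lld %lld\n",
             (long long)x.v[0], (long long)x.v[1], (long long)x.v[9]);
      return 0;
    }

Keeping every limb comfortably below 32 bits is what allows the AArch64 code to interleave scalar 64-bit multiplies with NEON 32x32->64 multiplies without reducing after each operation, which is the heart of the speedup over the saturated 4x64-bit form being removed below.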
- -#define scalar sp, #(0*NUMSIZE) +// Pointer-offset pairs for temporaries on stack -#define pointx sp, #(1*NUMSIZE) +#define scalar sp, #0 +#define pointx sp, #32 +#define mask1 sp, #72 +#define mask2 sp, #80 +#define tmpa sp, #88 +#define tmpb sp, #128 +#define xn sp, #128 +#define zn sp, #160 -#define zm sp, #(2*NUMSIZE) -#define sm sp, #(2*NUMSIZE) -#define dpro sp, #(2*NUMSIZE) - -#define sn sp, #(3*NUMSIZE) - -#define dm sp, #(4*NUMSIZE) - -#define zn sp, #(5*NUMSIZE) -#define dn sp, #(5*NUMSIZE) -#define e sp, #(5*NUMSIZE) - -#define dmsn sp, #(6*NUMSIZE) -#define p sp, #(6*NUMSIZE) - -#define xm sp, #(7*NUMSIZE) -#define dnsm sp, #(7*NUMSIZE) -#define spro sp, #(7*NUMSIZE) - -#define d sp, #(8*NUMSIZE) - -#define xn sp, #(9*NUMSIZE) -#define s sp, #(9*NUMSIZE) +#define res sp, #192 +#define i sp, #200 +#define swap sp, #208 // Total size to reserve on the stack -#define NSPACE (10*NUMSIZE) - -// Macro wrapping up the basic field operation bignum_mul_p25519, only -// trivially different from a pure function call to that subroutine. - -#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ 
- adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// A version of multiplication that only guarantees output < 2 * p_25519. -// This basically skips the +1 and final correction in quotient estimation. - -#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ 
- csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// Squaring just giving a result < 2 * p_25519, which is done by -// basically skipping the +1 in the quotient estimate and the final -// optional correction. 
- -#define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, #32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ - stp x4, x5, [P0+16] - -// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. -// This only ensures that the result fits in 4 digits, not that it is reduced -// even w.r.t. double modulus. The result is always correct modulo provided -// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided -// at least one of them is reduced double modulo. 
- -#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16] - -// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 - -#define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ - stp x7, x8, [p0+16] - -// Combined z = c * x + y with reduction only < 2 * p_25519 -// where c is initially in the X1 register. It is assumed -// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a -// high mul in the final part. - -#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ - stp x5, x6, [p0+16] - -// Multiplex: z := if NZ then x else y - -#define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0+16] +#define NSPACE 224 +#define regsave sp, #NSPACE S2N_BN_SYMBOL(curve25519_x25519): -// Save regs and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, #NSPACE +// Save registers and make additional room NSPACE for temporaries. +// We only need to save the low 64-bits of the Q8...Q15 registers +// according to the ABI, so we use a save of the D8...D15 forms. + + sub sp, sp, #NSPACE+160 + stp d8, d9, [regsave+0] + stp d10, d11, [regsave+16] + stp d12, d13, [regsave+32] + stp d14, d15, [regsave+48] + stp x19, x20, [regsave+64] + stp x21, x22, [regsave+80] + stp x23, x24, [regsave+96] + stp x25, x26, [regsave+112] + stp x27, x28, [regsave+128] + stp x29, x30, [regsave+144] // Move the output pointer to a stable place - mov res, x0 + str x0, [res] -// Copy the inputs to the local variables with minimal mangling: -// -// - The scalar is in principle turned into 01xxx...xxx000 but -// in the structure below the special handling of these bits is -// explicit in the main computation; the scalar is just copied. -// -// - The point x coord is reduced mod 2^255 by masking off the -// top bit. In the main loop we only need reduction < 2 * p_25519. +// Copy the scalar to the corresponding local variable while +// mangling it. In principle it becomes 01xxx...xxx000 where +// the xxx are the corresponding bits of the original input +// scalar. 
We actually don't bother forcing the MSB to zero, +// but rather start the main loop below at 254 instead of 255. ldp x10, x11, [x1] + bic x10, x10, #7 stp x10, x11, [scalar] ldp x12, x13, [x1, #16] + orr x13, x13, #0x4000000000000000 stp x12, x13, [scalar+16] - ldp x10, x11, [x2] - stp x10, x11, [pointx] - ldp x12, x13, [x2, #16] - and x13, x13, #0x7fffffffffffffff +// Discard the MSB of the point X coordinate (this is in +// accordance with the RFC, mod 2^255, *not* 2^255-19). +// Then recode it into the unsaturated base 25.5 form. + + ldp x0, x1, [x2] + ldp x2, x3, [x2, #16] + + lsr x12, x0, #51 + lsr x17, x2, #51 + orr x12, x12, x1, lsl #13 + orr x17, x17, x3, lsl #13 + ubfx x8, x3, #12, #26 + ubfx x9, x3, #38, #25 + ubfx x11, x0, #26, #25 + ubfx x13, x1, #13, #25 + lsr x14, x1, #38 + ubfx x16, x2, #25, #26 + and x10, x0, #0x3ffffff + and x12, x12, #0x3ffffff + and x15, x2, #0x1ffffff + and x17, x17, #0x1ffffff + orr x10, x10, x11, lsl #32 + orr x11, x12, x13, lsl #32 + orr x12, x14, x15, lsl #32 + orr x13, x16, x17, lsl #32 + orr x14, x8, x9, lsl #32 + + stp x10, x11, [pointx+0] stp x12, x13, [pointx+16] + str x14, [pointx+32] + +// Initialize (X2,Z2) = (1,0), the identity (projective point at infinity) + + mov x1, #1 + mov v0.d[0], x1 + mov v2.d[0], xzr + mov v4.d[0], xzr + mov v6.d[0], xzr + mov v8.d[0], xzr + + mov v1.d[0], xzr + mov v3.d[0], xzr + mov v5.d[0], xzr + mov v7.d[0], xzr + mov v9.d[0], xzr + +// Initialize (X3,Z3) = (X,1), projective representation of X + + mov v10.d[0], x10 + mov v12.d[0], x11 + mov v14.d[0], x12 + mov v16.d[0], x13 + mov v18.d[0], x14 + + mov v11.d[0], x1 + mov v13.d[0], xzr + mov v15.d[0], xzr + mov v17.d[0], xzr + mov v19.d[0], xzr + +// Set up some constants used repeatedly in the main loop: +// +// Q31 = 0x1300000013 (two 32-bit copies of 19) +// Q30 = 0x3ffffff0000000003ffffff (two 64-bit copies of 2^26-1) +// Q29 = mask1 = (0x07ffffc,0x07fffffe) +// Q28 = mask2 = (0x07ffffb4,0x07fffffe) -// Initialize with explicit doubling in order to handle set bit 254. -// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1). -// We use the fact that the point x coordinate is still in registers. -// Since zm = 1 we could do the doubling with an operation count of -// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth -// the slight complication arising from a different linear combination. - - mov swap, #1 - stp x10, x11, [xm] - stp x12, x13, [xm+16] - stp swap, xzr, [zm] - stp xzr, xzr, [zm+16] - - sub_twice4(d,xm,zm) - add_twice4(s,xm,zm) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - -// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive). -// This is a classic Montgomery ladder, with the main coordinates only -// reduced mod 2 * p_25519, some intermediate results even more loosely. + mov w0, #19 + add x0, x0, x0, lsl #32 + mov v31.d[0], x0 + mov v31.d[1], xzr - mov i, #253 + mov x0, #(1<<26)-1 + mov v30.d[0], x0 + mov v30.d[1], x0 -curve25519_x25519_scalarloop: + mov x0, #0x07fffffe07fffffe + sub x1, x0, #0xfe-0xb4 + sub x0, x0, #2 -// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn + stp x0, x1, [mask1] + ldp d29, d28, [mask1] - sub_twice4(dm,xm,zm) - add_twice4(sn,xn,zn) - sub_twice4(dn,xn,zn) - add_twice4(sm,xm,zm) +// The main loop over (modified) bits from i = 254, ..., i = 0 (inclusive); +// we explicitly skip bit 255 because it should be forced to zero initially. 
+// This is a classic Montgomery ladder using a "swap" variable. +// It's assumed x0 = i at the start of the loop, but that is volatile and +// needs to be reloaded from memory at the end of the loop. -// ADDING: dmsn = dm * sn -// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) + str xzr, [swap] + mov x0, #254 + str x0, [i] - mul_4(dmsn,sn,dm) +curve25519_x25519_scalarloop: - lsr x0, i, #6 - ldr x2, [sp, x0, lsl #3] // Exploiting scalar = sp exactly - lsr x2, x2, i + lsr x1, x0, #6 + ldr x2, [sp, x1, lsl #3] // Exploiting scalar = sp exactly + lsr x2, x2, x0 and x2, x2, #1 - cmp swap, x2 - mov swap, x2 - - mux_4(d,dm,dn) - mux_4(s,sm,sn) - -// ADDING: dnsm = sm * dn - - mul_4(dnsm,sm,dn) - -// DOUBLING: d = (xt - zt)^2 - - sqr_4(d,d) - -// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 -// DOUBLING: s = (xt + zt)^2 - - sub_twice4(dpro,dmsn,dnsm) - sqr_4(s,s) - add_twice4(spro,dmsn,dnsm) - sqr_4(dpro,dpro) - -// DOUBLING: p = 4 * xt * zt = s - d - - sub_twice4(p,s,d) - -// ADDING: xm' = (dmsn + dnsm)^2 - - sqr_4(xm,spro) - -// DOUBLING: e = 121666 * p + d - - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - -// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d - - mul_4(xn,s,d) - -// ADDING: zm' = x * (dmsn - dnsm)^2 - - mul_4(zm,dpro,pointx) - -// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) -// = p * (d + 121666 * p) - - mul_4(zn,p,e) - -// Loop down as far as 3 (inclusive) - - sub i, i, #1 - cmp i, #3 + ldr x0, [swap] + cmp x0, x2 + str x2, [swap] + +// The following inner loop code is derived closely following Lenngren's +// implementation available at "https://github.com/Emill/X25519-AArch64". +// In particular, the basic dataflow and the organization between integer +// and SIMD units is identical, with only a few minor changes to some +// individual instructions (for miscellaneous reasons). The scheduling +// was redone from scratch by SLOTHY starting from Hanno Becker's +// un-interleaved form and using the same scripts as in Becker et al's +// paper. +// +// The intermediate value annotations were added to provide data that +// is used in the formal proof, indicating which lines assign specific +// digits of the various intermediate results (mainly of field +// operations, sometimes other transformations). The names used for +// the intermediate results are similar but not identical to those in +// the abstract Algorithm 1 description in Lenngren's paper. Almost +// all equations are to be interpreted as field operations, i.e. as +// arithmetic modulo 2^255-19, not simple numeric equalities. 
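+//
+// Note that each field element here is in the unsaturated base 25.5
+// representation introduced above: ten digits of alternately 26 and
+// 25 bits, kept as 32-bit values and packed two per 64-bit lane of
+// the vector registers, so a single .2S operation acts on a pair of
+// digits at once.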
+// +// b = x2 - z2 +// d = x3 - z3 +// a = x2 + z2 +// c = x3 + z3 +// f = if flip then c else a +// g = if flip then d else b +// aa = f^2 +// bb = g^2 +// bbalt = bb (change of representation) +// e = aa - bb +// bce = bbalt + 121666 * e +// z4 = bce * e +// bc = b * c +// ad = a * d +// t1 = ad + bc +// t2 = ad - bc +// x5 = t1^2 +// t3 = t2^2 +// x4 = aa * bb +// z5 = x * t3 +// +// Then the main variables are updated for the next iteration as +// +// (x2',z2') = (x4,z4) +// (x3',z3') = (x5,z5) + + add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2S, v28.2S, v1.2S + add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2S, v29.2S, v3.2S + add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2S, v29.2S, v15.2S + sub v1.2S, v29.2S, v5.2S + sub v26.2S, v28.2S, v11.2S + sub v21.2S, v29.2S, v19.2S + add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2S, v29.2S, v17.2S + add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2S, v29.2S, v13.2S + sub v13.2S, v29.2S, v7.2S + add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2S, v29.2S, v9.2S + add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f + add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f + add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f + mov x0, v20.d[0] + fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f + mov x21, v12.d[0] + fcsel d29, d7, d21, eq // ubignum_of_qreglist 4 // INTERMEDIATE g + mov x5, v16.d[0] + lsr x26, x0, #32 + add x29, x21, x21 + umull x15, w5, w29 + add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add x12, x26, x26 + mov x30, v5.d[0] + fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g + lsr x11, x5, #32 + lsr x10, x30, #32 + trn2 v20.2S, v21.2S, v3.2S + add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + add x14, x11, x11 + trn2 v6.2S, v2.2S, v15.2S + trn1 v12.2S, v25.2S, v0.2S + add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2S, v23.2S, v13.2S + fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g + trn2 v17.2S, v27.2S, v24.2S + str d29, [tmpb+32] + add x17, x10, x10 + trn2 v4.2S, v28.2S, v1.2S + trn1 v5.2S, v28.2S, v1.2S + trn1 v28.2S, v2.2S, v15.2S + trn1 v2.2S, v22.2S, v18.2S + fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g + trn2 v15.2S, v22.2S, v18.2S + umull v22.2D, v12.2S, v20.2S + umull x22, w30, w17 + stp d29, d10, [tmpb+0] + trn2 v10.2S, v23.2S, v13.2S + trn2 v23.2S, v11.2S, v14.2S + trn1 v13.2S, v27.2S, v24.2S + fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g + trn1 v14.2S, 
v11.2S, v14.2S + umlal v22.2D, v2.2S, v6.2S + umull x25, w30, w30 + umlal v22.2D, v5.2S, v23.2S + add x3, x30, x30 + umlal v22.2D, v16.2S, v17.2S + add w30, w21, w21, lsl #1; + stp d27, d8, [tmpb+16] + add w30, w30, w21, lsl #4 + trn1 v11.2S, v26.2S, v19.2S + trn2 v8.2S, v26.2S, v19.2S + trn2 v19.2S, v25.2S, v0.2S + mul v29.2S, v20.2S, v31.2S + ldr x20, [tmpb+24] + umull v25.2D, v19.2S, v6.2S + add x1, x0, x0 + umull v27.2D, v19.2S, v23.2S + umull x9, w5, w1 + umull v0.2D, v12.2S, v23.2S + lsr x24, x20, #32 + mul v20.2S, v23.2S, v31.2S + lsr x16, x21, #32 + umlal v25.2D, v15.2S, v23.2S + umaddl x13, w11, w14, x9 + umlal v25.2D, v4.2S, v17.2S + umaddl x9, w14, w17, x15 + umull v24.2D, v12.2S, v6.2S + add w2, w16, w16, lsl #1; + fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f + add w2, w2, w16, lsl #4 + trn1 v18.2S, v21.2S, v3.2S + umull v3.2D, v19.2S, v29.2S + umull x28, w5, w3 + mul v1.2S, v6.2S, v31.2S + umull x8, w5, w5 + umlal v24.2D, v2.2S, v23.2S + umaddl x13, w21, w30, x13 + mul v23.2S, v17.2S, v31.2S + umaddl x27, w14, w12, x28 + trn2 v6.2S, v9.2S, v7.2S + mov x6, v26.d[0] + umlal v3.2D, v15.2S, v1.2S + add x16, x16, x16 + umlal v3.2D, v4.2S, v20.2S + lsr x4, x6, #32 + umlal v3.2D, v10.2S, v23.2S + add x7, x6, x6 + umull v26.2D, v19.2S, v8.2S + add x23, x4, x4 + umaddl x28, w5, w23, x22 + trn1 v7.2S, v9.2S, v7.2S + umlal v27.2D, v15.2S, v17.2S + add w15, w4, w4, lsl #1; + umlal v27.2D, v4.2S, v8.2S + add w15, w15, w4, lsl #4 + add w22, w10, w10, lsl #1; + umlal v24.2D, v5.2S, v17.2S + add w22, w22, w10, lsl #4 + umaddl x10, w11, w7, x28 + umlal v25.2D, v10.2S, v8.2S + umull x21, w5, w16 + umlal v25.2D, v6.2S, v29.2S + umaddl x23, w15, w23, x25 + umlal v27.2D, v10.2S, v29.2S + umull x19, w5, w12 + umlal v27.2D, v6.2S, v1.2S + umaddl x25, w11, w29, x21 + umlal v0.2D, v2.2S, v17.2S + umaddl x28, w0, w3, x9 + shl v21.2D, v25.2D, #1 + umaddl x4, w11, w1, x19 + umaddl x21, w2, w29, x4 + mul v25.2S, v8.2S, v31.2S + umlal v24.2D, v16.2S, v8.2S + umaddl x19, w0, w17, x25 + umlal v24.2D, v7.2S, v29.2S + umull x25, w5, w17 + umlal v24.2D, v19.2S, v28.2S + umaddl x4, w0, w16, x10 + umull v9.2D, v12.2S, v8.2S + umaddl x23, w5, w7, x23 + umlal v21.2D, v12.2S, v18.2S + add w10, w6, w6, lsl #1; + shl v27.2D, v27.2D, #1 + add w10, w10, w6, lsl #4 + umaddl x28, w26, w12, x28 + umlal v26.2D, v15.2S, v29.2S + umaddl x9, w14, w16, x23 + umlal v9.2D, v2.2S, v29.2S + umaddl x22, w22, w17, x8 + umlal v21.2D, v2.2S, v28.2S + umaddl x28, w6, w10, x28 + umaddl x27, w0, w0, x27 + add x8, x14, x14 + umlal v0.2D, v5.2S, v8.2S + umull x5, w5, w14 + umlal v9.2D, v5.2S, v1.2S + umaddl x14, w0, w29, x9 + umlal v26.2D, v4.2S, v1.2S + umaddl x6, w2, w16, x27 + umlal v22.2D, v7.2S, v8.2S + umaddl x5, w30, w17, x5 + umaddl x5, w2, w3, x5 + add x23, x17, x17 + umlal v27.2D, v12.2S, v28.2S + umaddl x13, w2, w23, x13 + umlal v26.2D, v10.2S, v20.2S + add x9, x12, x12 + umlal v9.2D, v16.2S, v20.2S + umaddl x27, w10, w29, x6 + umlal v0.2D, v16.2S, v29.2S + umaddl x6, w11, w3, x25 + umlal v22.2D, v19.2S, v18.2S + umaddl x19, w26, w3, x19 + mul v18.2S, v18.2S, v31.2S + umaddl x23, w15, w23, x27 + umlal v3.2D, v6.2S, v25.2S + umaddl x0, w0, w12, x6 + umlal v0.2D, v7.2S, v1.2S + add x11, x16, x16 + umlal v9.2D, v7.2S, v23.2S + umaddl x6, w12, w17, x14 + umlal v9.2D, v19.2S, v11.2S + umaddl x25, w26, w29, x4 + umlal v9.2D, v15.2S, v18.2S + umaddl x14, w10, w3, x13 + umull v25.2D, v12.2S, v17.2S + umaddl x27, w10, w16, x0 + umlal v26.2D, v6.2S, v23.2S + add x0, x25, x6, lsr #26 + mul v23.2S, v28.2S, v31.2S + umaddl 
x12, w10, w12, x5 + shl v3.2D, v3.2D, #1 + add x16, x22, x0, lsr #25 + umlal v21.2D, v5.2S, v14.2S + bic x22, x0, #0x1ffffff + umlal v3.2D, v12.2S, v11.2S + add x26, x16, x22, lsr #24 + umlal v3.2D, v2.2S, v18.2S + umaddl x16, w10, w17, x21 + umlal v3.2D, v5.2S, v23.2S + add x22, x26, x22, lsr #21 + umlal v9.2D, v4.2S, v23.2S + umaddl x5, w15, w29, x27 + umull v17.2D, v19.2S, v17.2S + umaddl x17, w30, w3, x22 + umlal v25.2D, v2.2S, v8.2S + umaddl x25, w15, w3, x16 + umlal v25.2D, v5.2S, v29.2S + umaddl x26, w15, w7, x19 + umlal v0.2D, v19.2S, v14.2S + umaddl x17, w2, w9, x17 + umlal v17.2D, v15.2S, v8.2S + ldr x19, [tmpb+0] + umlal v17.2D, v4.2S, v29.2S + ldr x7, [tmpb+8] + shl v29.2D, v26.2D, #1 + umaddl x13, w10, w1, x17 + umlal v0.2D, v15.2S, v13.2S + lsr x2, x19, #32 + umlal v29.2D, v12.2S, v13.2S + umaddl x27, w15, w1, x12 + umlal v29.2D, v2.2S, v11.2S + umaddl x30, w15, w8, x13 + umlal v29.2D, v5.2S, v18.2S + add x4, x7, x7 + umlal v29.2D, v16.2S, v23.2S + umaddl x29, w15, w9, x14 + umlal v0.2D, v4.2S, v11.2S + add x17, x27, x30, lsr #26 + umlal v0.2D, v10.2S, v18.2S + umaddl x16, w15, w11, x28 + umlal v0.2D, v6.2S, v23.2S + add x1, x29, x17, lsr #25 + umlal v25.2D, v16.2S, v1.2S + umull x11, w19, w4 + ldr x8, [tmpb+32] + mul v26.2S, v14.2S, v31.2S + umlal v17.2D, v10.2S, v1.2S + ldr x15, [tmpb+16] + umlal v17.2D, v6.2S, v20.2S + and x9, x30, #0x3ffffff + bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa + add x17, x2, x2 + lsr x10, x15, #32 + add x27, x25, x1, lsr #26 + umlal v25.2D, v7.2S, v20.2S + add x13, x10, x10 + umlal v25.2D, v19.2S, v13.2S + add x29, x23, x27, lsr #25 + umlal v25.2D, v15.2S, v11.2S + lsr x30, x8, #32 + umlal v25.2D, v4.2S, v18.2S + add x23, x5, x29, lsr #26 + umlal v25.2D, v10.2S, v23.2S + and x14, x29, #0x3ffffff + umlal v25.2D, v6.2S, v26.2S + add x5, x16, x23, lsr #25 + shl v8.2D, v17.2D, #1 + umaddl x12, w2, w17, x11 + and x29, x5, #0x3ffffff + umull x21, w19, w19 + umlal v29.2D, v7.2S, v26.2S + add w16, w10, w10, lsl #1; + umlal v3.2D, v16.2S, v26.2S + add w16, w16, w10, lsl #4 + bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa + add w10, w24, w24, lsl #1; + add x22, x26, x5, lsr #26 + add w10, w10, w24, lsl #4 + umlal v8.2D, v12.2S, v14.2S + umaddl x25, w16, w13, x21 + umlal v8.2D, v2.2S, v13.2S + bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa + umlal v8.2D, v5.2S, v11.2S + add x26, x24, x24 + umlal v8.2D, v16.2S, v18.2S + stp x14, x29, [tmpa+16] + umlal v8.2D, v7.2S, v23.2S + add w24, w30, w30, lsl #1; + usra v25.2D, v29.2D, #26 + add w24, w24, w30, lsl #4 + umull x29, w15, w15 + umlal v27.2D, v2.2S, v14.2S + umull x3, w15, w13 + umlal v27.2D, v5.2S, v13.2S + add x21, x20, x20 + umlal v24.2D, v15.2S, v14.2S + umull x5, w19, w21 + umlal v24.2D, v4.2S, v13.2S + and x11, x1, #0x3ffffff + usra v8.2D, v25.2D, #25 + and x1, x0, #0x1ffffff + umlal v27.2D, v16.2S, v11.2S + umaddl x23, w17, w13, x5 + umlal v27.2D, v7.2S, v18.2S + add x5, x30, x30 + usra v0.2D, v8.2D, #26 + add x0, x15, x15 + umlal v24.2D, v10.2S, v11.2S + umaddl x23, w7, w0, x23 + umlal v24.2D, v6.2S, v18.2S + lsr x30, x7, #32 + usra v27.2D, v0.2D, #25 + add x16, x30, x30 + and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + umaddl x15, w30, w16, x23 + ushr v23.2D, v30.2D, #1 + add w23, w8, w8, lsl #1; + usra v24.2D, v27.2D, #26 + add w23, w23, w8, lsl #4 + umaddl x14, w19, w5, x3 + and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = 
bc|ad + add x28, x8, x8 + and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + umaddl x8, w8, w23, x15 + and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + umaddl x3, w2, w28, x14 + umlal v22.2D, v15.2S, v28.2S + bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa + uzp1 v5.4S, v8.4S, v5.4S + umaddl x14, w24, w5, x29 + umaddl x5, w19, w28, x14 + ldr d18, [mask1] + mov v18.d[1], v18.d[0] + umaddl x15, w7, w26, x3 + mul v12.2S, v13.2S, v31.2S + umlal v21.2D, v16.2S, v13.2S + stp x9, x11, [tmpa+0] + umlal v21.2D, v7.2S, v11.2S + umaddl x29, w17, w26, x5 + umlal v22.2D, v4.2S, v14.2S + add w14, w20, w20, lsl #1; + umlal v22.2D, v10.2S, v13.2S + add w14, w14, w20, lsl #4 + umull x3, w19, w0 + umlal v22.2D, v6.2S, v11.2S + umaddl x29, w7, w21, x29 + usra v21.2D, v24.2D, #25 + umaddl x11, w20, w14, x12 + and v0.16B, v25.16B, v23.16B + umaddl x5, w30, w21, x15 + and v14.16B, v29.16B, v30.16B + umaddl x12, w16, w13, x29 + usra v22.2D, v21.2D, #26 + umaddl x29, w17, w16, x3 + umlal v3.2D, v7.2S, v12.2S + add x9, x26, x26 + and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + add x27, x5, x12, lsr #26 + bic v8.16B, v22.16B, v23.16B + umaddl x29, w7, w7, x29 + and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + add x5, x25, x27, lsr #25 + usra v3.2D, v8.2D, #25 + umaddl x25, w24, w9, x8 + umlal v9.2D, v10.2S, v26.2S + add x8, x13, x13 + trn1 v22.4S, v1.4S, v17.4S + umaddl x11, w10, w8, x11 + usra v3.2D, v8.2D, #24 + umull x20, w19, w16 + add v26.2S, v22.2S, v18.2S + ldr d28, [mask2] + umlal v9.2D, v6.2S, v12.2S + umaddl x3, w23, w0, x11 + usra v3.2D, v8.2D, #21 + umaddl x29, w10, w26, x29 + uzp1 v11.4S, v20.4S, v27.4S + umaddl x20, w2, w4, x20 + umaddl x9, w10, w21, x20 + mov v17.d[0], v22.d[1] + usra v9.2D, v3.2D, #26 + umull x15, w19, w13 + and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + add x11, x16, x16 + uzp2 v1.4S, v11.4S, v5.4S + umaddl x20, w23, w13, x9 + and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + umaddl x9, w2, w0, x15 + usra v14.2D, v9.2D, #25 + and x6, x6, #0x3ffffff + uzp1 v7.4S, v7.4S, v8.4S + umaddl x29, w23, w21, x29 + uzp1 v27.4S, v11.4S, v5.4S + umull x15, w19, w26 + usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + add x6, x6, x22, lsr #25 + and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + bic x22, x27, #0x1ffffff + sub v2.2S, v26.2S, v17.2S + add v9.2S, v22.2S, v17.2S + uzp1 v14.4S, v3.4S, v0.4S + umaddl x2, w2, w21, x15 + add v5.4S, v27.4S, v18.4S + add x5, x5, x22, lsr #24 + zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + mov v18.b[0], v28.b[0] + uzp1 v8.4S, v7.4S, v14.4S + add x22, x5, x22, lsr #21 + uzp2 v3.4S, v7.4S, v14.4S + umaddl x5, w7, w16, x9 + add v25.4S, v8.4S, v18.4S + umaddl x15, w14, w0, x22 + add v12.4S, v27.4S, v1.4S + add x9, x17, x17 + sub v14.4S, v5.4S, v1.4S + umull x19, w19, w17 + sub v18.4S, v25.4S, v3.4S + ldr x22, [tmpa+8] + add v20.4S, v8.4S, v3.4S + umaddl x15, w10, w11, x15 + zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + umaddl x14, w14, w13, x19 + zip2 v14.4S, 
v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + and x17, x27, #0x1ffffff + zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + umaddl x15, w23, w4, x15 + zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + umaddl x10, w10, w0, x14 + zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2S, v0.2S, #1 + mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 + shl v26.2S, v22.2S, #1 + shl v17.2S, v16.2S, #1 + mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 + shl v7.2S, v5.2S, #1 + shl v18.2S, v19.2S, #1 + umull v11.2D, v1.2S, v24.2S + umaddl x19, w23, w16, x10 + umull v6.2D, v1.2S, v17.2S + umaddl x10, w7, w13, x2 + mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 + mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 + umull v9.2D, v1.2S, v26.2S + ldr x13, [tmpa+0] + shl v28.2S, v15.2S, #1 + shl v3.2S, v10.2S, #1 + ldr x14, [tmpa+16] + mul v12.2S, v10.2S, v31.2S + umull v25.2D, v1.2S, v7.2S + ldr x2, [tmpa+24] + umlal v6.2D, v18.2S, v28.2S + umaddl x27, w30, w0, x10 + umaddl x16, w24, w0, x20 + shl v13.2S, v14.2S, #1 + umaddl x5, w23, w26, x5 + mul v2.2S, v22.2S, v31.2S + umull v21.2D, v1.2S, v13.2S + umaddl x23, w24, w8, x29 + umlal v11.2D, v18.2S, v19.2S + mov x10, #0x07fffffe07fffffe + sub x10, x10, #2 + umaddl x26, w24, w21, x5 + mul v29.2S, v14.2S, v31.2S + umlal v25.2D, v19.2S, v26.2S + add x7, x1, x6, lsr #26 + mul v20.2S, v4.2S, v31.2S + and x6, x6, #0x3ffffff + shl v8.2S, v18.2S, #1 + shl v4.2S, v4.2S, #1 + umlal v11.2D, v29.2S, v14.2S + bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa + umlal v25.2D, v0.2S, v3.2S + umaddl x0, w24, w4, x19 + umlal v25.2D, v15.2S, v13.2S + str x6, [tmpa+32] + umlal v21.2D, v18.2S, v4.2S + umaddl x8, w24, w11, x3 + umlal v21.2D, v0.2S, v17.2S + ldr x30, [tmpa+32] + mul v14.2S, v5.2S, v31.2S + add x2, x2, x10 + shl v5.2S, v28.2S, #1 + shl v27.2S, v4.2S, #1 + umlal v6.2D, v0.2S, v0.2S + umaddl x11, w24, w9, x15 + umlal v6.2D, v12.2S, v3.2S + add x4, x30, x10 + umlal v11.2D, v14.2S, v5.2S + add x3, x22, x10 + umlal v11.2D, v2.2S, v17.2S + add x6, x0, x11, lsr #26 + umlal v11.2D, v12.2S, v27.2S + add x14, x14, x10 + umlal v6.2D, v14.2S, v27.2S + add x8, x8, x6, lsr #25 + umlal v6.2D, v2.2S, v13.2S + movk x10, #0xffb4 + umlal v25.2D, v16.2S, v4.2S + add x29, x16, x8, lsr #26 + umull v27.2D, v1.2S, v3.2S + and x11, x11, #0x3ffffff + umlal v9.2D, v18.2S, v3.2S + add x19, x13, x10 + umlal v9.2D, v0.2S, v13.2S + and x5, x8, #0x3ffffff + umlal v9.2D, v28.2S, v4.2S + bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb + umlal v9.2D, v16.2S, v16.2S + umaddl x30, w24, w28, x27 + umlal v9.2D, v14.2S, v7.2S + sub x13, x19, x11 + umull v10.2D, v1.2S, v18.2S + add x7, x23, x29, lsr #25 + umlal v21.2D, v28.2S, v15.2S + lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e + umlal v21.2D, v2.2S, v22.2S + add x0, x26, x7, lsr #26 + usra v25.2D, v9.2D, #26 + and x20, x7, #0x3ffffff + umull v22.2D, v1.2S, v1.2S + add x8, x25, x0, lsr #25 + umull v7.2D, v1.2S, v28.2S + and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt + bic v18.16B, v25.16B, v23.16B + and x19, x8, 
#0x3ffffff + and v16.16B, v9.16B, v30.16B + and x7, x12, #0x3ffffff + usra v22.2D, v18.2D, #25 + add x10, x30, x8, lsr #26 + umlal v7.2D, v19.2S, v24.2S + bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb + and v9.16B, v25.16B, v23.16B + add x27, x7, x10, lsr #25 + usra v22.2D, v18.2D, #24 + mov x21, #60833 + lsl x21, x21, #1 + add x15, x17, x27, lsr #26 + shl v25.2S, v3.2S, #1 + umlal v7.2D, v14.2S, v17.2S + and x29, x27, #0x3ffffff + usra v22.2D, v18.2D, #21 + bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt + umlal v10.2D, v14.2S, v24.2S + and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt + umlal v10.2D, v2.2S, v28.2S + sub x6, x3, x5 + umlal v10.2D, v12.2S, v17.2S + umaddl x25, w16, w21, x17 + umlal v10.2D, v29.2S, v4.2S + mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt + umlal v22.2D, v20.2S, v4.2S + lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e + umlal v22.2D, v14.2S, v8.2S + and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt + umlal v22.2D, v2.2S, v24.2S + stp x11, x5, [tmpb+0] + umlal v22.2D, v12.2S, v5.2S + bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb + umlal v22.2D, v29.2S, v17.2S + umaddl x12, w6, w21, x12 + umull v18.2D, v1.2S, v4.2S + bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb + umlal v7.2D, v2.2S, v4.2S + sub x7, x14, x20 + umlal v27.2D, v19.2S, v13.2S + mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt + usra v10.2D, v22.2D, #26 + lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e + umlal v18.2D, v19.2S, v17.2S + and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt + umlal v7.2D, v12.2S, v13.2S + sub x5, x2, x19 + usra v11.2D, v10.2D, #25 + mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt + umlal v27.2D, v0.2S, v4.2S + umlal v21.2D, v14.2S, v25.2S + sub x23, x4, x29 + usra v7.2D, v11.2D, #26 + mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt + umlal v18.2D, v0.2S, v28.2S + lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e + umlal v27.2D, v15.2S, v17.2S + str x29, [tmpb+32] + usra v6.2D, v7.2D, #25 + mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt + and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + umaddl x27, w26, w21, x1 + umlal v18.2D, v14.2S, v13.2S + umaddl x30, w23, w21, x0 + umlal v18.2D, v2.2S, v3.2S + lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e + and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 + umaddl x4, w14, w21, x24 + ldr x0, [tmpa+0] + mov v0.s[1], w0 + lsr x0, x0, #32 + mov v1.s[1], w0 + umaddl x9, w7, w21, x8 + usra v18.2D, v6.2D, #26 + umaddl x24, w10, w21, x28 + and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + umaddl x8, w22, w21, x15 + umlal v27.2D, v14.2S, v26.2S + umaddl x15, w13, w21, x17 + usra v21.2D, v18.2D, #25 + stp x20, x19, [tmpb+16] + and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + lsr x29, x8, #25 + ldr x3, [tmpb+0] + mov v10.s[1], w3 + lsr x3, x3, #32 + mov v11.s[1], w3 + add x17, x15, x29 + usra v27.2D, v21.2D, #26 + 
add x28, x17, x29, lsl #1 + and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and x20, x8, #0x1ffffff + and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + add x17, x28, x29, lsl #4 + and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + ldr x3, [tmpb+8] + mov v22.s[1], w3 + lsr x3, x3, #32 + mov v23.s[1], w3 + add x29, x25, x17, lsr #26 + ldr x15, [pointx+0] + mov v10.s[0], w15 + lsr x15, x15, #32 + mov v11.s[0], w15 + and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce + usra v16.2D, v27.2D, #25 + add x8, x12, x29, lsr #25 + ldr x3, [tmpb+16] + mov v14.s[1], w3 + lsr x3, x3, #32 + mov v15.s[1], w3 + and x12, x29, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bce + ldr x15, [pointx+8] + mov v22.s[0], w15 + lsr x15, x15, #32 + mov v23.s[0], w15 + add x28, x27, x8, lsr #26 + and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + umull x1, w12, w10 + ldr x3, [tmpb+24] + mov v17.s[1], w3 + lsr x3, x3, #32 + mov v18.s[1], w3 + add x25, x9, x28, lsr #25 + ldr x15, [pointx+16] + mov v14.s[0], w15 + lsr x15, x15, #32 + mov v15.s[0], w15 + umaddl x19, w5, w21, x2 + usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + add x2, x4, x25, lsr #26 + ldr x3, [tmpb+32] + mov v24.s[1], w3 + lsr x3, x3, #32 + mov v25.s[1], w3 + umull x3, w12, w23 + ldr x15, [pointx+24] + mov v17.s[0], w15 + lsr x15, x15, #32 + mov v18.s[0], w15 + add x29, x19, x2, lsr #25 + umull v26.2D, v0.2S, v23.2S + and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce + ldr x0, [tmpa+8] + mov v2.s[1], w0 + lsr x0, x0, #32 + mov v3.s[1], w0 + umaddl x27, w21, w5, x3 + ldr x15, [pointx+32] + mov v24.s[0], w15 + lsr x15, x15, #32 + mov v25.s[0], w15 + add x17, x24, x29, lsr #26 + umull v29.2D, v1.2S, v18.2S + and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce + umull v20.2D, v0.2S, v15.2S + add x19, x30, x17, lsr #25 + and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce + mul v12.2S, v25.2S, v31.2S + ldr x0, [tmpa+16] + mov v4.s[1], w0 + lsr x0, x0, #32 + mov v5.s[1], w0 + add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v26.2D, v2.2S, v11.2S + add w28, w3, w3, lsl #1; + umlal v20.2D, v2.2S, v23.2S + add w28, w28, w3, lsl #4 + umull x8, w12, w5 + ldr x0, [tmpa+24] + mov v6.s[1], w0 + lsr x0, x0, #32 + mov v7.s[1], w0 + and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce + mul v16.2S, v18.2S, v31.2S + add w17, w4, w4, lsl #1; + umull v21.2D, v1.2S, v15.2S + add w17, w17, w4, lsl #4 + umaddl x25, w21, w7, x8 + umlal v20.2D, v4.2S, v11.2S + add w8, w21, w21, lsl #1; + ldr x0, [tmpa+32] + add w8, w8, w21, lsl #4 + mov v8.s[1], w0 + lsr x0, x0, #32 + mov v9.s[1], w0 + and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce + umlal v29.2D, v3.2S, v15.2S + umaddl x24, w2, w6, x25 + umull v13.2D, v0.2S, v25.2S + umaddl x25, w2, w7, x27 + umaddl x0, w3, w6, x25 + mul v19.2S, v15.2S, v31.2S + umull v27.2D, v0.2S, v18.2S + umaddl x20, w3, w13, x24 + umlal v20.2D, v6.2S, v12.2S + umaddl x24, w21, w14, x1 + umlal v13.2D, v2.2S, v18.2S + umaddl x9, w4, w13, x0 + umull v25.2D, v0.2S, v11.2S + umaddl x20, w17, w23, x20 + umlal v27.2D, v2.2S, v15.2S + umaddl x0, w2, w26, x24 + umull v28.2D, v1.2S, v11.2S + umull x24, w17, w5 + umlal v29.2D, v5.2S, v23.2S + 
umaddl x9, w11, w22, x9 + umlal v13.2D, v4.2S, v15.2S + umaddl x27, w3, w16, x0 + umlal v27.2D, v4.2S, v23.2S + umull x0, w17, w14 + umlal v27.2D, v6.2S, v11.2S + umull x4, w12, w14 + umlal v27.2D, v8.2S, v12.2S + umaddl x25, w11, w10, x20 + umlal v27.2D, v1.2S, v17.2S + umaddl x0, w28, w10, x0 + umlal v13.2D, v6.2S, v23.2S + umull x3, w17, w6 + umlal v13.2D, v8.2S, v11.2S + umaddl x1, w21, w26, x4 + umlal v20.2D, v8.2S, v16.2S + umaddl x4, w2, w13, x24 + umlal v28.2D, v3.2S, v12.2S + umaddl x20, w28, w7, x3 + umlal v29.2D, v7.2S, v11.2S + and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v29.2D, v9.2S, v12.2S + umaddl x19, w17, w22, x27 + add w27, w2, w2, lsl #1; + mul v18.2S, v24.2S, v31.2S + add w27, w27, w2, lsl #4 + umlal v21.2D, v3.2S, v23.2S + umull x24, w17, w7 + umlal v13.2D, v1.2S, v24.2S + add x19, x19, x19 + shl v29.2D, v29.2D, #1 + umaddl x1, w2, w16, x1 + umull v15.2D, v1.2S, v23.2S + umaddl x0, w27, w22, x0 + umlal v29.2D, v0.2S, v24.2S + umaddl x2, w28, w5, x24 + mul v24.2S, v23.2S, v31.2S + umaddl x4, w28, w23, x4 + umlal v21.2D, v5.2S, v11.2S + umaddl x24, w27, w5, x20 + umlal v20.2D, v1.2S, v14.2S + umaddl x20, w11, w23, x19 + umlal v26.2D, v4.2S, v12.2S + umaddl x19, w27, w23, x2 + umlal v26.2D, v6.2S, v16.2S + umaddl x2, w21, w6, x4 + umlal v29.2D, v2.2S, v17.2S + umaddl x24, w8, w23, x24 + umlal v15.2D, v3.2S, v11.2S + umaddl x0, w21, w16, x0 + umaddl x4, w21, w13, x19 + mul v23.2S, v11.2S, v31.2S + umlal v20.2D, v3.2S, v22.2S + umaddl x2, w12, w7, x2 + umlal v20.2D, v5.2S, v10.2S + umaddl x19, w12, w26, x0 + umlal v29.2D, v4.2S, v14.2S + umaddl x0, w12, w13, x24 + umlal v26.2D, v8.2S, v19.2S + umaddl x20, w15, w5, x20 + umlal v26.2D, v1.2S, v22.2S + umaddl x21, w15, w10, x9 + umlal v26.2D, v3.2S, v10.2S + and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce + umlal v29.2D, v6.2S, v22.2S + umaddl x20, w30, w7, x20 + umaddl x1, w28, w22, x1 + add x24, x19, x19 + umull v11.2D, v1.2S, v12.2S + add w19, w3, w3, lsl #1; + umlal v26.2D, v5.2S, v18.2S + add w19, w19, w3, lsl #4 + umaddl x20, w9, w6, x20 + umlal v29.2D, v8.2S, v10.2S + add w29, w9, w9, lsl #1; + umlal v13.2D, v3.2S, v17.2S + add w29, w29, w9, lsl #4 + umaddl x2, w19, w10, x2 + umlal v11.2D, v3.2S, v16.2S + umaddl x21, w30, w14, x21 + umlal v11.2D, v5.2S, v19.2S + umaddl x20, w3, w13, x20 + umlal v11.2D, v7.2S, v24.2S + umaddl x2, w29, w22, x2 + umlal v11.2D, v9.2S, v23.2S + umaddl x21, w9, w26, x21 + ushr v23.2D, v30.2D, #1 + umaddl x1, w17, w10, x1 + umlal v13.2D, v5.2S, v14.2S + umaddl x24, w19, w5, x24 + umlal v27.2D, v3.2S, v14.2S + umaddl x21, w3, w16, x21 + shl v11.2D, v11.2D, #1 + add w3, w30, w30, lsl #1; + umlal v28.2D, v5.2S, v16.2S + add w3, w3, w30, lsl #4 + umaddl x24, w29, w23, x24 + umlal v28.2D, v7.2S, v19.2S + add x1, x1, x1 + umlal v28.2D, v9.2S, v24.2S + umaddl x1, w11, w5, x1 + umlal v15.2D, v5.2S, v12.2S + umaddl x24, w30, w13, x24 + umlal v15.2D, v7.2S, v16.2S + umaddl x25, w15, w14, x25 + umlal v15.2D, v9.2S, v19.2S + umaddl x1, w15, w7, x1 + shl v28.2D, v28.2D, #1 + umaddl x24, w15, w6, x24 + umlal v21.2D, v7.2S, v12.2S + umaddl x2, w30, w16, x2 + umlal v21.2D, v9.2S, v16.2S + umaddl x25, w30, w26, x25 + shl v15.2D, v15.2D, #1 + umaddl x30, w30, w6, x1 + umlal v28.2D, v0.2S, v22.2S + umaddl x1, w15, w26, x2 + umlal v28.2D, v2.2S, v10.2S + umaddl x2, w9, w16, x25 + shl v21.2D, v21.2D, #1 + umaddl x24, w11, w7, x24 + umlal v15.2D, v0.2S, v14.2S + umaddl x1, w11, w14, x1 + umlal v21.2D, v0.2S, v17.2S + umaddl x25, w9, w13, x30 + umlal v28.2D, 
v4.2S, v18.2S + umaddl x0, w19, w26, x0 + umlal v25.2D, v2.2S, v12.2S + add x1, x1, x24, lsr #26 + umlal v25.2D, v4.2S, v16.2S + umaddl x30, w19, w22, x2 + umlal v21.2D, v2.2S, v14.2S + umaddl x4, w12, w6, x4 + mul v14.2S, v14.2S, v31.2S + umaddl x25, w19, w23, x25 + and x2, x1, #0x1ffffff + mul v16.2S, v17.2S, v31.2S + umlal v25.2D, v6.2S, v19.2S + umaddl x9, w19, w14, x4 + umlal v13.2D, v7.2S, v22.2S + add x25, x25, x1, lsr #25 + umlal v21.2D, v4.2S, v22.2S + umaddl x0, w29, w14, x0 + umlal v26.2D, v7.2S, v16.2S + add x30, x30, x25, lsr #26 + umlal v26.2D, v9.2S, v14.2S + add w1, w15, w15, lsl #1; + umlal v28.2D, v6.2S, v16.2S + add w1, w1, w15, lsl #4 + add x4, x20, x30, lsr #25 + umlal v28.2D, v8.2S, v14.2S + and x25, x25, #0x3ffffff + umlal v15.2D, v2.2S, v22.2S + add x21, x21, x4, lsr #26 + umlal v11.2D, v0.2S, v10.2S + bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 + umlal v11.2D, v2.2S, v18.2S + bic x30, x21, #0x3ffffff + usra v26.2D, v28.2D, #26 + lsr x20, x30, #26 + umlal v15.2D, v4.2S, v10.2S + add x20, x20, x30, lsr #25 + umlal v15.2D, v6.2S, v18.2S + umaddl x9, w29, w10, x9 + umlal v15.2D, v8.2S, v16.2S + add x30, x20, x30, lsr #22 + umlal v27.2D, v5.2S, v22.2S + umull x20, w17, w26 + umlal v20.2D, v7.2S, v18.2S + umaddl x30, w17, w16, x30 + umlal v20.2D, v9.2S, v16.2S + umaddl x17, w3, w10, x0 + usra v15.2D, v26.2D, #25 + umaddl x0, w28, w14, x20 + umlal v27.2D, v7.2S, v10.2S + umaddl x20, w28, w26, x30 + umlal v27.2D, v9.2S, v18.2S + add w28, w12, w12, lsl #1; + usra v20.2D, v15.2D, #26 + add w28, w28, w12, lsl #4 + umaddl x30, w27, w10, x0 + and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + umaddl x27, w27, w14, x20 + umaddl x0, w8, w10, x27 + mul v12.2S, v22.2S, v31.2S + and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + umaddl x14, w3, w22, x9 + umlal v21.2D, v6.2S, v10.2S + umaddl x27, w8, w22, x30 + trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + umaddl x10, w28, w22, x0 + umlal v11.2D, v4.2S, v16.2S + umaddl x30, w15, w16, x14 + and v26.16B, v26.16B, v23.16B + umaddl x28, w12, w16, x27 + umlal v21.2D, v8.2S, v18.2S + add x10, x10, x10 + umlal v25.2D, v8.2S, v24.2S + umaddl x20, w19, w6, x10 + umlal v25.2D, v1.2S, v10.2S + add x28, x28, x28 + umlal v25.2D, v3.2S, v18.2S + umaddl x28, w19, w7, x28 + usra v21.2D, v20.2D, #25 + umaddl x0, w29, w7, x20 + umlal v11.2D, v6.2S, v14.2S + umaddl x10, w11, w26, x30 + umlal v13.2D, v9.2S, v10.2S + umaddl x19, w29, w5, x28 + usra v27.2D, v21.2D, #26 + umaddl x0, w3, w5, x0 + umlal v25.2D, v5.2S, v16.2S + umaddl x20, w1, w22, x17 + and v20.16B, v28.16B, v30.16B + umaddl x29, w3, w23, x19 + usra v29.2D, v27.2D, #25 + umaddl x3, w1, w23, x0 + and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2D, v8.2S, v12.2S + umaddl x12, w15, w13, x29 + usra v13.2D, v29.2D, #26 + umaddl x7, w11, w13, x3 + trn1 v6.4S, v6.4S, v7.4S + umaddl x17, w11, w16, x20 + umlal v25.2D, v7.2S, v14.2S + and x23, x4, #0x3ffffff + bic v19.16B, v13.16B, v23.16B + umaddl x19, w11, w6, x12 + and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 + add x3, x17, x7, lsr #26 + usra v11.2D, v19.2D, #25 + trn1 v2.4S, v2.4S, v3.4S + add x17, x19, x3, lsr #25 + and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and x5, x7, #0x3ffffff + usra v11.2D, v19.2D, #24 + add x7, 
x10, x17, lsr #26 + trn1 v0.4S, v0.4S, v1.4S + and x19, x24, #0x3ffffff + and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + add x29, x19, x7, lsr #25 + usra v11.2D, v19.2D, #21 + bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 + trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + add x19, x2, x29, lsr #26 + trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + and x3, x29, #0x3ffffff + mov v16.d[0], v6.d[1] // FINAL x3 + mov v6.d[0], v17.d[1] // FINAL x2 + trn1 v8.4S, v8.4S, v9.4S + bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 + and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 + mov v18.d[0], v8.d[1] // FINAL x3 + mov v8.d[0], v19.d[1] // FINAL x2 + umlal v25.2D, v9.2S, v12.2S + mov v9.d[0], x23 // FINAL z2 + mov v7.d[0], x25 // FINAL z2 + ldr d29, [mask1] + mov v12.d[0], v2.d[1] // FINAL x3 + trn1 v4.4S, v4.4S, v5.4S + and x17, x17, #0x3ffffff + usra v25.2D, v11.2D, #26 + mov v10.d[0], v0.d[1] // FINAL x3 + mov v14.d[0], v4.d[1] // FINAL x3 + mov v4.d[0], v15.d[1] // FINAL x2 + usra v20.2D, v25.2D, #25 + and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 + mov v5.d[0], x3 // depth 86 + mov v1.d[0], x5 // FINAL z2 + usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4S, v21.4S, v27.4S // FINAL z3 + trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + mov v0.d[0], v11.d[1] // FINAL x2 + mov v3.d[0], x17 // FINAL z2 + mov v2.d[0], v13.d[1] // FINAL x2 + ldr d28, [mask2] + + ldr x0, [i] + subs x0, x0, #1 + str x0, [i] bcs curve25519_x25519_scalarloop -// Multiplex directly into (xn,zn) then do three pure doubling steps; -// this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. - - cmp swap, xzr - mux_4(xn,xm,xn) - mux_4(zn,zm,zn) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - - sub_twice4(d,xn,zn) - add_twice4(s,xn,zn) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_p25519(zn,p,e) - -// The projective result of the scalar multiplication is now (xn,zn). -// Prepare to call the modular inverse function to get xm = 1/zn +// Repack X2 into the saturated representation as 256-bit value xn. +// This does not fully normalize mod 2^255-19 but stays within 256 bits. 
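+//
+// As an illustrative sketch of this repacking (C-like pseudocode; the
+// names t0..t4 and n0..n3 are purely expository, with d0..d9 being the
+// ten unsaturated digits at bit positions 0, 26, 51, 77, 102, 128,
+// 153, 179, 204 and 230), the code first forms five 64-bit chunks
+//
+//   t0 = d0 + (d1 << 26);   // weight 2^0
+//   t1 = d2 + (d3 << 26);   // weight 2^51
+//   t2 = d4 + (d5 << 26);   // weight 2^102
+//   t3 = d6 + (d7 << 26);   // weight 2^153
+//   t4 = d8 + (d9 << 26);   // weight 2^204
+//
+// and then merges them into four saturated 64-bit words, propagating
+// the carries upwards through the additions:
+//
+//   n0 = t0 + (t1 << 51)
+//   n1 = (t1 >> 13) + (t2 << 38)
+//   n2 = (t2 >> 26) + (t3 << 25)
+//   n3 = (t3 >> 39) + (t4 << 12)
+//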
+
+ mov w0, v0.s[0]
+ mov w1, v0.s[1]
+ mov w2, v2.s[0]
+ mov w3, v2.s[1]
+ mov w4, v4.s[0]
+ mov w5, v4.s[1]
+ mov w6, v6.s[0]
+ mov w7, v6.s[1]
+ mov w8, v8.s[0]
+ mov w9, v8.s[1]
+
+ add x0, x0, x1, lsl #26
+ add x1, x2, x3, lsl #26
+ add x2, x4, x5, lsl #26
+ add x3, x6, x7, lsl #26
+ add x4, x8, x9, lsl #26
+
+ adds x0, x0, x1, lsl #51
+ lsr x6, x1, #13
+ lsl x7, x2, #38
+ adcs x1, x6, x7
+ lsr x8, x2, #26
+ lsl x9, x3, #25
+ adcs x2, x8, x9
+ lsr x10, x3, #39
+ lsl x11, x4, #12
+ adc x3, x10, x11
+ stp x0, x1, [xn]
+ stp x2, x3, [xn+16]

- add x0, xm

+// Repack Z2 into the saturated representation as 256-bit value zn.
+// This does not fully normalize mod 2^255-19. However, since Z2,
+// unlike X2, was not repacked (within the last multiplication) in
+// right-to-left order, its top digit can be any 26-bit value, on
+// the face of it. To make sure we don't overflow 256 bits here
+// we remove b = the 25th bit of the 9th digit (now scaled by 2^230,
+// giving bit 25 a final weighting of 2^255) and add 19 * b
+// to the bottom of the sum here to compensate mod 2^255-19.
+
+ mov w0, v1.s[0]
+ mov w1, v1.s[1]
+ mov w2, v3.s[0]
+ mov w3, v3.s[1]
+ mov w4, v5.s[0]
+ mov w5, v5.s[1]
+ mov w6, v7.s[0]
+ mov w7, v7.s[1]
+ mov w8, v9.s[0]
+ mov w9, v9.s[1]
+
+ mov w10, #19
+ add x0, x0, x1, lsl #26
+ tst x9, #0x2000000
+ add x1, x2, x3, lsl #26
+ csel x10, x10, xzr, ne
+ add x2, x4, x5, lsl #26
+ and x9, x9, #0x1FFFFFF
+ add x3, x6, x7, lsl #26
+ add x0, x0, x10
+ add x4, x8, x9, lsl #26
+
+ adds x0, x0, x1, lsl #51
+ lsr x6, x1, #13
+ lsl x7, x2, #38
+ adcs x1, x6, x7
+ lsr x8, x2, #26
+ lsl x9, x3, #25
+ adcs x2, x8, x9
+ lsr x10, x3, #39
+ lsl x11, x4, #12
+ adc x3, x10, x11
+ stp x0, x1, [zn]
+ stp x2, x3, [zn+16]
+
+// Because the lowest bit (indeed, the three lowest bits) of the scalar
+// were forced to zero, we know that the projective result of the scalar
+// multiplication was in (X2,Z2) and is now (xn,zn) in saturated form.
+// Prepare to call the modular inverse function to get zn' = 1/zn.
+
+ add x0, zn
 add x1, zn

 // Inline copy of bignum_inv_p25519, identical except for stripping out
@@ -860,7 +1348,7 @@ curve25519_x25519_scalarloop:
 // and reclaiming room on the stack. For more details and explanations see
 // "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for
 // its own temporaries is 128 bytes, so it has no effect on variables
-// that are needed in the rest of our computation here: res, xn and zn.
+// that are needed in the rest of our computation here: res, xn, and zn.

 mov x20, x0
 mov x10, #0xffffffffffffffed
@@ -1891,36 +2379,210 @@ curve25519_x25519_invmidloop:
 stp x0, x1, [x4]
 stp x2, x5, [x4, #16]

-// Since we eventually want to return 0 when the result is the point at
-// infinity, we force xn = 0 whenever zn = 0. This avoids building in a
-// dependency on the behavior of modular inverse in out-of-scope cases.
-
- ldp x0, x1, [zn]
- ldp x2, x3, [zn+16]
- orr x0, x0, x1
- orr x2, x2, x3
- orr x4, x0, x2
- cmp x4, xzr
- ldp x0, x1, [xn]
- csel x0, x0, xzr, ne
- csel x1, x1, xzr, ne
- ldp x2, x3, [xn+16]
- stp x0, x1, [xn]
- csel x2, x2, xzr, ne
- csel x3, x3, xzr, ne
- stp x2, x3, [xn+16]
-
 // Now the result is xn * (1/zn), fully reduced modulo p.
- - mul_p25519(resx,xn,xm) - -// Restore stack and registers - - add sp, sp, #NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. The multiplication below is just an inlined +// version of bignum_mul_p25519 except for the detailed +// addressing of inputs and outputs + + ldr x17, [res] + + ldp x3, x4, [xn] + ldp x5, x6, [zn] + umull x7, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x8, w16, w0 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [xn+16] + ldp x5, x6, [zn+16] + umull x11, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x12, w16, w0 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [xn+16] + ldp x15, x16, [xn] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, cc + ldp x15, x0, [zn] + subs x5, x15, x5 + sbcs x6, x0, x6 + csetm x0, cc + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x0 + subs x5, x5, x0 + eor x6, x6, x0 + sbc x6, x6, x0 + eor x16, x0, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x0, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x9, cc + adds x15, x15, x0 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, cc + cinv x9, x9, cc + mul x5, x4, x6 + umulh x6, x4, x6 + adds x0, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #0x1 + eor x5, x5, x9 + adcs x0, x5, x0 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #0x1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x0, x0, x16 + adcs x10, x0, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, x16 + adc x14, x14, x16 + mov x3, #0x26 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov 
x10, x4 + lsr x0, x14, #31 + mov x5, #0x13 + umaddl x5, w5, w0, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x0, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #0x13 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [x17] + stp x9, x10, [x17, #16] + +// Restore stack and registers (this will zero the tops of Q8...Q15). + + ldp d8, d9, [regsave+0] + ldp d10, d11, [regsave+16] + ldp d12, d13, [regsave+32] + ldp d14, d15, [regsave+48] + ldp x19, x20, [regsave+64] + ldp x21, x22, [regsave+80] + ldp x23, x24, [regsave+96] + ldp x25, x26, [regsave+112] + ldp x27, x28, [regsave+128] + ldp x29, x30, [regsave+144] + add sp, sp, #NSPACE+160 ret #if defined(__linux__) && defined(__ELF__) diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index 3e3c03371d..e6c891284d 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -1,6 +1,18 @@ // Copyright Amazon.com, Inc. or its affiliates. All Rights Reserved. // SPDX-License-Identifier: Apache-2.0 OR ISC OR MIT-0 +// ********************************************************************** +// This code is substantially derived from Emil Lenngren's implementation +// +// https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf +// https://github.com/Emill/X25519-AArch64 +// +// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// +// https://eprint.iacr.org/2022/1303.pdf +// https://github.com/slothy-optimizer/slothy/tree/main/paper +// ********************************************************************** + // ---------------------------------------------------------------------------- // The x25519 function for curve25519 (byte array arguments) // Inputs scalar[32] (bytes), point[32] (bytes); output res[32] (bytes) @@ -26,671 +38,53 @@ .text .balign 4 -// Size of individual field elements - -#define NUMSIZE 32 - -// Stable homes for the input result argument during the whole body -// and other variables that are only needed prior to the modular inverse. - -#define res x23 -#define i x20 -#define swap x21 - -// Pointers to result x coord to be written - -#define resx res, #0 - -// Pointer-offset pairs for temporaries on stack with some aliasing. - -#define scalar sp, #(0*NUMSIZE) - -#define pointx sp, #(1*NUMSIZE) +// Pointer-offset pairs for temporaries on stack -#define zm sp, #(2*NUMSIZE) -#define sm sp, #(2*NUMSIZE) -#define dpro sp, #(2*NUMSIZE) +#define scalar sp, #0 +#define pointx sp, #32 +#define mask1 sp, #72 +#define mask2 sp, #80 +#define tmpa sp, #88 +#define tmpb sp, #128 +#define xn sp, #128 +#define zn sp, #160 -#define sn sp, #(3*NUMSIZE) - -#define dm sp, #(4*NUMSIZE) - -#define zn sp, #(5*NUMSIZE) -#define dn sp, #(5*NUMSIZE) -#define e sp, #(5*NUMSIZE) - -#define dmsn sp, #(6*NUMSIZE) -#define p sp, #(6*NUMSIZE) - -#define xm sp, #(7*NUMSIZE) -#define dnsm sp, #(7*NUMSIZE) -#define spro sp, #(7*NUMSIZE) - -#define d sp, #(8*NUMSIZE) - -#define xn sp, #(9*NUMSIZE) -#define s sp, #(9*NUMSIZE) +#define res sp, #192 +#define i sp, #200 +#define swap sp, #208 // Total size to reserve on the stack -#define NSPACE (10*NUMSIZE) - -// Macro wrapping up the basic field operation bignum_mul_p25519, only -// trivially different from a pure function call to that subroutine. 
- -#define mul_p25519(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, 
x14, #31; \ - mov x5, #0x13; \ - umaddl x5, w5, w0, x5; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - mov x3, #0x13; \ - tst x10, #0x8000000000000000; \ - csel x3, x3, xzr, pl; \ - subs x7, x7, x3; \ - sbcs x8, x8, xzr; \ - sbcs x9, x9, xzr; \ - sbc x10, x10, xzr; \ - and x10, x10, #0x7fffffffffffffff; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// A version of multiplication that only guarantees output < 2 * p_25519. -// This basically skips the +1 and final correction in quotient estimation. - -#define mul_4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x5, x6, [P2]; \ - umull x7, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x8, w16, w0; \ - umull x16, w3, w16; \ - adds x7, x7, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x8, x8, x15; \ - adds x7, x7, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x8, x8, x16; \ - mul x9, x4, x6; \ - umulh x10, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x9, x9, x8; \ - adc x10, x10, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x8, x7, x9; \ - adcs x9, x9, x10; \ - adc x10, x10, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x8, x15, x8; \ - eor x3, x3, x16; \ - adcs x9, x3, x9; \ - adc x10, x10, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x5, x6, [P2+16]; \ - umull x11, w3, w5; \ - lsr x0, x3, #32; \ - umull x15, w0, w5; \ - lsr x16, x5, #32; \ - umull x12, w16, w0; \ - umull x16, w3, w16; \ - adds x11, x11, x15, lsl #32; \ - lsr x15, x15, #32; \ - adc x12, x12, x15; \ - adds x11, x11, x16, lsl #32; \ - lsr x16, x16, #32; \ - adc x12, x12, x16; \ - mul x13, x4, x6; \ - umulh x14, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x16, cc; \ - adds x13, x13, x12; \ - adc x14, x14, xzr; \ - subs x3, x5, x6; \ - cneg x3, x3, cc; \ - cinv x16, x16, cc; \ - mul x15, x4, x3; \ - umulh x3, x4, x3; \ - adds x12, x11, x13; \ - adcs x13, x13, x14; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x15, x15, x16; \ - adcs x12, x15, x12; \ - eor x3, x3, x16; \ - adcs x13, x3, x13; \ - adc x14, x14, x16; \ - ldp x3, x4, [P1+16]; \ - ldp x15, x16, [P1]; \ - subs x3, x3, x15; \ - sbcs x4, x4, x16; \ - csetm x16, cc; \ - ldp x15, x0, [P2]; \ - subs x5, x15, x5; \ - sbcs x6, x0, x6; \ - csetm x0, cc; \ - eor x3, x3, x16; \ - subs x3, x3, x16; \ - eor x4, x4, x16; \ - sbc x4, x4, x16; \ - eor x5, x5, x0; \ - subs x5, x5, x0; \ - eor x6, x6, x0; \ - sbc x6, x6, x0; \ - eor x16, x0, x16; \ - adds x11, x11, x9; \ - adcs x12, x12, x10; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - mul x2, x3, x5; \ - umulh x0, x3, x5; \ - mul x15, x4, x6; \ - umulh x1, x4, x6; \ - subs x4, x4, x3; \ - cneg x4, x4, cc; \ - csetm x9, cc; \ - adds x15, x15, x0; \ - adc x1, x1, xzr; \ - subs x6, x5, x6; \ - cneg x6, x6, cc; \ - cinv x9, x9, cc; \ - mul x5, x4, x6; \ - umulh x6, x4, x6; \ - adds x0, x2, x15; \ - adcs x15, x15, x1; \ - adc x1, x1, xzr; \ - cmn x9, #0x1; \ - eor x5, x5, x9; \ - adcs x0, x5, x0; \ - eor x6, x6, x9; \ - adcs x15, x6, x15; \ - adc x1, x1, x9; \ - adds x9, x11, x7; \ - adcs x10, x12, x8; \ - adcs x11, x13, x11; \ - adcs x12, x14, x12; \ - adcs x13, x13, xzr; \ - adc x14, x14, xzr; \ - cmn x16, #0x1; \ - eor x2, x2, x16; \ - adcs x9, x2, x9; \ - eor x0, x0, x16; \ - adcs x10, x0, x10; \ - eor x15, x15, x16; \ - 
adcs x11, x15, x11; \ - eor x1, x1, x16; \ - adcs x12, x1, x12; \ - adcs x13, x13, x16; \ - adc x14, x14, x16; \ - mov x3, #0x26; \ - umull x4, w11, w3; \ - add x4, x4, w7, uxtw; \ - lsr x7, x7, #32; \ - lsr x11, x11, #32; \ - umaddl x11, w11, w3, x7; \ - mov x7, x4; \ - umull x4, w12, w3; \ - add x4, x4, w8, uxtw; \ - lsr x8, x8, #32; \ - lsr x12, x12, #32; \ - umaddl x12, w12, w3, x8; \ - mov x8, x4; \ - umull x4, w13, w3; \ - add x4, x4, w9, uxtw; \ - lsr x9, x9, #32; \ - lsr x13, x13, #32; \ - umaddl x13, w13, w3, x9; \ - mov x9, x4; \ - umull x4, w14, w3; \ - add x4, x4, w10, uxtw; \ - lsr x10, x10, #32; \ - lsr x14, x14, #32; \ - umaddl x14, w14, w3, x10; \ - mov x10, x4; \ - lsr x0, x14, #31; \ - mov x5, #0x13; \ - umull x5, w5, w0; \ - add x7, x7, x5; \ - adds x7, x7, x11, lsl #32; \ - extr x3, x12, x11, #32; \ - adcs x8, x8, x3; \ - extr x3, x13, x12, #32; \ - adcs x9, x9, x3; \ - extr x3, x14, x13, #32; \ - lsl x5, x0, #63; \ - eor x10, x10, x5; \ - adc x10, x10, x3; \ - stp x7, x8, [P0]; \ - stp x9, x10, [P0+16] - -// Squaring just giving a result < 2 * p_25519, which is done by -// basically skipping the +1 in the quotient estimate and the final -// optional correction. - -#define sqr_4(P0,P1) \ - ldp x10, x11, [P1]; \ - ldp x12, x13, [P1+16]; \ - umull x2, w10, w10; \ - lsr x14, x10, #32; \ - umull x3, w14, w14; \ - umull x14, w10, w14; \ - adds x2, x2, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x3, x3, x14; \ - umull x4, w11, w11; \ - lsr x14, x11, #32; \ - umull x5, w14, w14; \ - umull x14, w11, w14; \ - mul x15, x10, x11; \ - umulh x16, x10, x11; \ - adds x4, x4, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x5, x5, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x5, x5, xzr; \ - adds x3, x3, x15; \ - adcs x4, x4, x16; \ - adc x5, x5, xzr; \ - umull x6, w12, w12; \ - lsr x14, x12, #32; \ - umull x7, w14, w14; \ - umull x14, w12, w14; \ - adds x6, x6, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x7, x7, x14; \ - umull x8, w13, w13; \ - lsr x14, x13, #32; \ - umull x9, w14, w14; \ - umull x14, w13, w14; \ - mul x15, x12, x13; \ - umulh x16, x12, x13; \ - adds x8, x8, x14, lsl #33; \ - lsr x14, x14, #31; \ - adc x9, x9, x14; \ - adds x15, x15, x15; \ - adcs x16, x16, x16; \ - adc x9, x9, xzr; \ - adds x7, x7, x15; \ - adcs x8, x8, x16; \ - adc x9, x9, xzr; \ - subs x10, x10, x12; \ - sbcs x11, x11, x13; \ - csetm x16, cc; \ - eor x10, x10, x16; \ - subs x10, x10, x16; \ - eor x11, x11, x16; \ - sbc x11, x11, x16; \ - adds x6, x6, x4; \ - adcs x7, x7, x5; \ - adcs x8, x8, xzr; \ - adc x9, x9, xzr; \ - umull x12, w10, w10; \ - lsr x5, x10, #32; \ - umull x13, w5, w5; \ - umull x5, w10, w5; \ - adds x12, x12, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x13, x13, x5; \ - umull x15, w11, w11; \ - lsr x5, x11, #32; \ - umull x14, w5, w5; \ - umull x5, w11, w5; \ - mul x4, x10, x11; \ - umulh x16, x10, x11; \ - adds x15, x15, x5, lsl #33; \ - lsr x5, x5, #31; \ - adc x14, x14, x5; \ - adds x4, x4, x4; \ - adcs x16, x16, x16; \ - adc x14, x14, xzr; \ - adds x13, x13, x4; \ - adcs x15, x15, x16; \ - adc x14, x14, xzr; \ - adds x4, x2, x6; \ - adcs x5, x3, x7; \ - adcs x6, x6, x8; \ - adcs x7, x7, x9; \ - csetm x16, cc; \ - subs x4, x4, x12; \ - sbcs x5, x5, x13; \ - sbcs x6, x6, x15; \ - sbcs x7, x7, x14; \ - adcs x8, x8, x16; \ - adc x9, x9, x16; \ - mov x10, #0x26; \ - umull x12, w6, w10; \ - add x12, x12, w2, uxtw; \ - lsr x2, x2, #32; \ - lsr x6, x6, #32; \ - umaddl x6, w6, w10, x2; \ - mov x2, x12; \ - umull x12, w7, w10; \ - add x12, x12, w3, uxtw; \ - lsr x3, x3, 
#32; \ - lsr x7, x7, #32; \ - umaddl x7, w7, w10, x3; \ - mov x3, x12; \ - umull x12, w8, w10; \ - add x12, x12, w4, uxtw; \ - lsr x4, x4, #32; \ - lsr x8, x8, #32; \ - umaddl x8, w8, w10, x4; \ - mov x4, x12; \ - umull x12, w9, w10; \ - add x12, x12, w5, uxtw; \ - lsr x5, x5, #32; \ - lsr x9, x9, #32; \ - umaddl x9, w9, w10, x5; \ - mov x5, x12; \ - lsr x13, x9, #31; \ - mov x11, #0x13; \ - umull x11, w11, w13; \ - add x2, x2, x11; \ - adds x2, x2, x6, lsl #32; \ - extr x10, x7, x6, #32; \ - adcs x3, x3, x10; \ - extr x10, x8, x7, #32; \ - adcs x4, x4, x10; \ - extr x10, x9, x8, #32; \ - lsl x11, x13, #63; \ - eor x5, x5, x11; \ - adc x5, x5, x10; \ - stp x2, x3, [P0]; \ - stp x4, x5, [P0+16] - -// Modular addition with double modulus 2 * p_25519 = 2^256 - 38. -// This only ensures that the result fits in 4 digits, not that it is reduced -// even w.r.t. double modulus. The result is always correct modulo provided -// the sum of the inputs is < 2^256 + 2^256 - 38, so in particular provided -// at least one of them is reduced double modulo. - -#define add_twice4(P0,P1,P2) \ - ldp x3, x4, [P1]; \ - ldp x7, x8, [P2]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x5, x6, [P1+16]; \ - ldp x7, x8, [P2+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - mov x9, #38; \ - csel x9, x9, xzr, cs; \ - adds x3, x3, x9; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [P0]; \ - stp x5, x6, [P0+16] - -// Modular subtraction with double modulus 2 * p_25519 = 2^256 - 38 - -#define sub_twice4(p0,p1,p2) \ - ldp x5, x6, [p1]; \ - ldp x4, x3, [p2]; \ - subs x5, x5, x4; \ - sbcs x6, x6, x3; \ - ldp x7, x8, [p1+16]; \ - ldp x4, x3, [p2+16]; \ - sbcs x7, x7, x4; \ - sbcs x8, x8, x3; \ - mov x4, #38; \ - csel x3, x4, xzr, lo; \ - subs x5, x5, x3; \ - sbcs x6, x6, xzr; \ - sbcs x7, x7, xzr; \ - sbc x8, x8, xzr; \ - stp x5, x6, [p0]; \ - stp x7, x8, [p0+16] - -// Combined z = c * x + y with reduction only < 2 * p_25519 -// where c is initially in the X1 register. It is assumed -// that 19 * (c * x + y) < 2^60 * 2^256 so we don't need a -// high mul in the final part. - -#define cmadd_4(p0,p2,p3) \ - ldp x7, x8, [p2]; \ - ldp x9, x10, [p2+16]; \ - mul x3, x1, x7; \ - mul x4, x1, x8; \ - mul x5, x1, x9; \ - mul x6, x1, x10; \ - umulh x7, x1, x7; \ - umulh x8, x1, x8; \ - umulh x9, x1, x9; \ - umulh x10, x1, x10; \ - adds x4, x4, x7; \ - adcs x5, x5, x8; \ - adcs x6, x6, x9; \ - adc x10, x10, xzr; \ - ldp x7, x8, [p3]; \ - adds x3, x3, x7; \ - adcs x4, x4, x8; \ - ldp x7, x8, [p3+16]; \ - adcs x5, x5, x7; \ - adcs x6, x6, x8; \ - adc x10, x10, xzr; \ - cmn x6, x6; \ - bic x6, x6, #0x8000000000000000; \ - adc x8, x10, x10; \ - mov x9, #19; \ - mul x7, x8, x9; \ - adds x3, x3, x7; \ - adcs x4, x4, xzr; \ - adcs x5, x5, xzr; \ - adc x6, x6, xzr; \ - stp x3, x4, [p0]; \ - stp x5, x6, [p0+16] - -// Multiplex: z := if NZ then x else y - -#define mux_4(p0,p1,p2) \ - ldp x0, x1, [p1]; \ - ldp x2, x3, [p2]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0]; \ - ldp x0, x1, [p1+16]; \ - ldp x2, x3, [p2+16]; \ - csel x0, x0, x2, ne; \ - csel x1, x1, x3, ne; \ - stp x0, x1, [p0+16] +#define NSPACE 224 +#define regsave sp, #NSPACE S2N_BN_SYMBOL(curve25519_x25519_byte): -// Save regs and make room for temporaries - - stp x19, x20, [sp, -16]! - stp x21, x22, [sp, -16]! - stp x23, x24, [sp, -16]! - sub sp, sp, #NSPACE +// Save registers and make additional room NSPACE for temporaries. 
+// We only need to save the low 64-bits of the Q8...Q15 registers +// according to the ABI, so we use a save of the D8...D15 forms. + + sub sp, sp, #NSPACE+160 + stp d8, d9, [regsave+0] + stp d10, d11, [regsave+16] + stp d12, d13, [regsave+32] + stp d14, d15, [regsave+48] + stp x19, x20, [regsave+64] + stp x21, x22, [regsave+80] + stp x23, x24, [regsave+96] + stp x25, x26, [regsave+112] + stp x27, x28, [regsave+128] + stp x29, x30, [regsave+144] // Move the output pointer to a stable place - mov res, x0 + str x0, [res] -// Copy the inputs to the local variables with minimal mangling: -// -// - The scalar is in principle turned into 01xxx...xxx000 but -// in the structure below the special handling of these bits is -// explicit in the main computation; the scalar is just copied. -// -// - The point x coord is reduced mod 2^255 by masking off the -// top bit. In the main loop we only need reduction < 2 * p_25519. +// Copy the scalar to the corresponding local variable while +// mangling it. In principle it becomes 01xxx...xxx000 where +// the xxx are the corresponding bits of the original input +// scalar. We actually don't bother forcing the MSB to zero, +// but rather start the main loop below at 254 instead of 255. ldrb w10, [x1] ldrb w0, [x1, #1] @@ -722,6 +116,7 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): orr x11, x11, x0, lsl #48 ldrb w0, [x1, #15] orr x11, x11, x0, lsl #56 + bic x10, x10, #7 stp x10, x11, [scalar] ldrb w12, [x1, #16] @@ -754,223 +149,1316 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): orr x13, x13, x0, lsl #48 ldrb w0, [x1, #31] orr x13, x13, x0, lsl #56 + orr x13, x13, #0x4000000000000000 stp x12, x13, [scalar+16] - ldrb w10, [x2] +// Discard the MSB of the point X coordinate (this is in +// accordance with the RFC, mod 2^255, *not* 2^255-19). +// Then recode it into the unsaturated base 25.5 form. 
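+
+// (In this form a 255-bit value is split into ten digits of alternating
+// 26-bit and 25-bit width, digit i holding the bits from position
+// ceil(25.5 * i) upwards, whence the name. The recoding performed by
+// the code below can be modelled roughly in C as follows, for
+// exposition only, with "recode_25p5" being a name invented here:
+//
+//      #include <stdint.h>
+//
+//      static void recode_25p5(uint32_t d[10], const uint64_t x[4])
+//      {
+//        unsigned pos = 0;                      // next bit position
+//        for (int i = 0; i < 10; i++) {
+//          unsigned w = (i & 1) ? 25 : 26;      // digit width
+//          uint64_t lo = x[pos >> 6] >> (pos & 63);
+//          if ((pos & 63) != 0 && (pos & 63) + w > 64)   // word straddle
+//            lo |= x[(pos >> 6) + 1] << (64 - (pos & 63));
+//          d[i] = (uint32_t) (lo & ((1u << w) - 1));
+//          pos += w;
+//        }
+//      }
+//
+// Bit 255 of the input never reaches any digit, which is exactly the
+// "mod 2^255" discarding of the MSB described above.)
+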
+ + ldrb w4, [x2] ldrb w0, [x2, #1] - orr x10, x10, x0, lsl #8 + orr x4, x4, x0, lsl #8 ldrb w0, [x2, #2] - orr x10, x10, x0, lsl #16 + orr x4, x4, x0, lsl #16 ldrb w0, [x2, #3] - orr x10, x10, x0, lsl #24 + orr x4, x4, x0, lsl #24 ldrb w0, [x2, #4] - orr x10, x10, x0, lsl #32 + orr x4, x4, x0, lsl #32 ldrb w0, [x2, #5] - orr x10, x10, x0, lsl #40 + orr x4, x4, x0, lsl #40 ldrb w0, [x2, #6] - orr x10, x10, x0, lsl #48 + orr x4, x4, x0, lsl #48 ldrb w0, [x2, #7] - orr x10, x10, x0, lsl #56 - ldrb w11, [x2, #8] + orr x4, x4, x0, lsl #56 + ldrb w5, [x2, #8] ldrb w0, [x2, #9] - orr x11, x11, x0, lsl #8 + orr x5, x5, x0, lsl #8 ldrb w0, [x2, #10] - orr x11, x11, x0, lsl #16 + orr x5, x5, x0, lsl #16 ldrb w0, [x2, #11] - orr x11, x11, x0, lsl #24 + orr x5, x5, x0, lsl #24 ldrb w0, [x2, #12] - orr x11, x11, x0, lsl #32 + orr x5, x5, x0, lsl #32 ldrb w0, [x2, #13] - orr x11, x11, x0, lsl #40 + orr x5, x5, x0, lsl #40 ldrb w0, [x2, #14] - orr x11, x11, x0, lsl #48 + orr x5, x5, x0, lsl #48 ldrb w0, [x2, #15] - orr x11, x11, x0, lsl #56 - stp x10, x11, [pointx] + orr x5, x5, x0, lsl #56 - ldrb w12, [x2, #16] + ldrb w6, [x2, #16] ldrb w0, [x2, #17] - orr x12, x12, x0, lsl #8 + orr x6, x6, x0, lsl #8 ldrb w0, [x2, #18] - orr x12, x12, x0, lsl #16 + orr x6, x6, x0, lsl #16 ldrb w0, [x2, #19] - orr x12, x12, x0, lsl #24 + orr x6, x6, x0, lsl #24 ldrb w0, [x2, #20] - orr x12, x12, x0, lsl #32 + orr x6, x6, x0, lsl #32 ldrb w0, [x2, #21] - orr x12, x12, x0, lsl #40 + orr x6, x6, x0, lsl #40 ldrb w0, [x2, #22] - orr x12, x12, x0, lsl #48 + orr x6, x6, x0, lsl #48 ldrb w0, [x2, #23] - orr x12, x12, x0, lsl #56 - ldrb w13, [x2, #24] + orr x6, x6, x0, lsl #56 + ldrb w7, [x2, #24] ldrb w0, [x2, #25] - orr x13, x13, x0, lsl #8 + orr x7, x7, x0, lsl #8 ldrb w0, [x2, #26] - orr x13, x13, x0, lsl #16 + orr x7, x7, x0, lsl #16 ldrb w0, [x2, #27] - orr x13, x13, x0, lsl #24 + orr x7, x7, x0, lsl #24 ldrb w0, [x2, #28] - orr x13, x13, x0, lsl #32 + orr x7, x7, x0, lsl #32 ldrb w0, [x2, #29] - orr x13, x13, x0, lsl #40 + orr x7, x7, x0, lsl #40 ldrb w0, [x2, #30] - orr x13, x13, x0, lsl #48 + orr x7, x7, x0, lsl #48 ldrb w0, [x2, #31] - orr x13, x13, x0, lsl #56 - and x13, x13, #0x7fffffffffffffff + orr x7, x7, x0, lsl #56 + + lsr x12, x4, #51 + lsr x17, x6, #51 + orr x12, x12, x5, lsl #13 + orr x17, x17, x7, lsl #13 + ubfx x8, x7, #12, #26 + ubfx x9, x7, #38, #25 + ubfx x11, x4, #26, #25 + ubfx x13, x5, #13, #25 + lsr x14, x5, #38 + ubfx x16, x6, #25, #26 + and x10, x4, #0x3ffffff + and x12, x12, #0x3ffffff + and x15, x6, #0x1ffffff + and x17, x17, #0x1ffffff + orr x10, x10, x11, lsl #32 + orr x11, x12, x13, lsl #32 + orr x12, x14, x15, lsl #32 + orr x13, x16, x17, lsl #32 + orr x14, x8, x9, lsl #32 + + stp x10, x11, [pointx+0] stp x12, x13, [pointx+16] + str x14, [pointx+32] + +// Initialize (X2,Z2) = (1,0), the identity (projective point at infinity) + + mov x1, #1 + mov v0.d[0], x1 + mov v2.d[0], xzr + mov v4.d[0], xzr + mov v6.d[0], xzr + mov v8.d[0], xzr + + mov v1.d[0], xzr + mov v3.d[0], xzr + mov v5.d[0], xzr + mov v7.d[0], xzr + mov v9.d[0], xzr + +// Initialize (X3,Z3) = (X,1), projective representation of X + + mov v10.d[0], x10 + mov v12.d[0], x11 + mov v14.d[0], x12 + mov v16.d[0], x13 + mov v18.d[0], x14 + + mov v11.d[0], x1 + mov v13.d[0], xzr + mov v15.d[0], xzr + mov v17.d[0], xzr + mov v19.d[0], xzr + +// Set up some constants used repeatedly in the main loop: +// +// Q31 = 0x1300000013 (two 32-bit copies of 19) +// Q30 = 0x3ffffff0000000003ffffff (two 64-bit copies of 2^26-1) +// Q29 = mask1 = 
(0x07ffffc,0x07fffffe) +// Q28 = mask2 = (0x07ffffb4,0x07fffffe) -// Initialize with explicit doubling in order to handle set bit 254. -// Set swap = 1 and (xm,zm) = (x,1) then double as (xn,zn) = 2 * (x,1). -// We use the fact that the point x coordinate is still in registers. -// Since zm = 1 we could do the doubling with an operation count of -// 2 * S + M instead of 2 * S + 2 * M, but it doesn't seem worth -// the slight complication arising from a different linear combination. - - mov swap, #1 - stp x10, x11, [xm] - stp x12, x13, [xm+16] - stp swap, xzr, [zm] - stp xzr, xzr, [zm+16] - - sub_twice4(d,xm,zm) - add_twice4(s,xm,zm) - sqr_4(d,d) - sqr_4(s,s) - sub_twice4(p,s,d) - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) - mul_4(xn,s,d) - mul_4(zn,p,e) - -// The main loop over unmodified bits from i = 253, ..., i = 3 (inclusive). -// This is a classic Montgomery ladder, with the main coordinates only -// reduced mod 2 * p_25519, some intermediate results even more loosely. - - mov i, #253 - -curve25519_x25519_byte_scalarloop: - -// sm = xm + zm; sn = xn + zn; dm = xm - zm; dn = xn - zn - - sub_twice4(dm,xm,zm) - add_twice4(sn,xn,zn) - sub_twice4(dn,xn,zn) - add_twice4(sm,xm,zm) - -// ADDING: dmsn = dm * sn -// DOUBLING: mux d = xt - zt and s = xt + zt for appropriate choice of (xt,zt) - - mul_4(dmsn,sn,dm) - - lsr x0, i, #6 - ldr x2, [sp, x0, lsl #3] // Exploiting scalar = sp exactly - lsr x2, x2, i - and x2, x2, #1 - - cmp swap, x2 - mov swap, x2 - - mux_4(d,dm,dn) - mux_4(s,sm,sn) - -// ADDING: dnsm = sm * dn - - mul_4(dnsm,sm,dn) - -// DOUBLING: d = (xt - zt)^2 - - sqr_4(d,d) - -// ADDING: dpro = (dmsn - dnsm)^2, spro = (dmsn + dnsm)^2 -// DOUBLING: s = (xt + zt)^2 - - sub_twice4(dpro,dmsn,dnsm) - sqr_4(s,s) - add_twice4(spro,dmsn,dnsm) - sqr_4(dpro,dpro) - -// DOUBLING: p = 4 * xt * zt = s - d - - sub_twice4(p,s,d) - -// ADDING: xm' = (dmsn + dnsm)^2 - - sqr_4(xm,spro) - -// DOUBLING: e = 121666 * p + d - - mov x1, 0xdb42 - orr x1, x1, 0x10000 - cmadd_4(e,p,d) + mov w0, #19 + add x0, x0, x0, lsl #32 + mov v31.d[0], x0 + mov v31.d[1], xzr -// DOUBLING: xn' = (xt + zt)^2 * (xt - zt)^2 = s * d + mov x0, #(1<<26)-1 + mov v30.d[0], x0 + mov v30.d[1], x0 - mul_4(xn,s,d) + mov x0, #0x07fffffe07fffffe + sub x1, x0, #0xfe-0xb4 + sub x0, x0, #2 -// ADDING: zm' = x * (dmsn - dnsm)^2 + stp x0, x1, [mask1] + ldp d29, d28, [mask1] - mul_4(zm,dpro,pointx) +// The main loop over (modified) bits from i = 254, ..., i = 0 (inclusive); +// we explicitly skip bit 255 because it should be forced to zero initially. +// This is a classic Montgomery ladder using a "swap" variable. +// It's assumed x0 = i at the start of the loop, but that is volatile and +// needs to be reloaded from memory at the end of the loop. -// DOUBLING: zn' = (4 * xt * zt) * ((xt - zt)^2 + 121666 * (4 * xt * zt)) -// = p * (d + 121666 * p) + str xzr, [swap] + mov x0, #254 + str x0, [i] - mul_4(zn,p,e) +curve25519_x25519_byte_scalarloop: -// Loop down as far as 3 (inclusive) + lsr x1, x0, #6 + ldr x2, [sp, x1, lsl #3] // Exploiting scalar = sp exactly + lsr x2, x2, x0 + and x2, x2, #1 - sub i, i, #1 - cmp i, #3 + ldr x0, [swap] + cmp x0, x2 + str x2, [swap] + +// The following inner loop code is derived closely following Lenngren's +// implementation available at "https://github.com/Emill/X25519-AArch64". +// In particular, the basic dataflow and the organization between integer +// and SIMD units is identical, with only a few minor changes to some +// individual instructions (for miscellaneous reasons). 
The scheduling +// was redone from scratch by SLOTHY starting from Hanno Becker's +// un-interleaved form and using the same scripts as in Becker et al's +// paper. +// +// The intermediate value annotations were added to provide data that +// is used in the formal proof, indicating which lines assign specific +// digits of the various intermediate results (mainly of field +// operations, sometimes other transformations). The names used for +// the intermediate results are similar but not identical to those in +// the abstract Algorithm 1 description in Lenngren's paper. Almost +// all equations are to be interpreted as field operations, i.e. as +// arithmetic modulo 2^255-19, not simple numeric equalities. +// +// b = x2 - z2 +// d = x3 - z3 +// a = x2 + z2 +// c = x3 + z3 +// f = if flip then c else a +// g = if flip then d else b +// aa = f^2 +// bb = g^2 +// bbalt = bb (change of representation) +// e = aa - bb +// bce = bbalt + 121666 * e +// z4 = bce * e +// bc = b * c +// ad = a * d +// t1 = ad + bc +// t2 = ad - bc +// x5 = t1^2 +// t3 = t2^2 +// x4 = aa * bb +// z5 = x * t3 +// +// Then the main variables are updated for the next iteration as +// +// (x2',z2') = (x4,z4) +// (x3',z3') = (x5,z5) + + add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2S, v28.2S, v1.2S + add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2S, v29.2S, v3.2S + add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2S, v29.2S, v15.2S + sub v1.2S, v29.2S, v5.2S + sub v26.2S, v28.2S, v11.2S + sub v21.2S, v29.2S, v19.2S + add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2S, v29.2S, v17.2S + add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2S, v29.2S, v13.2S + sub v13.2S, v29.2S, v7.2S + add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2S, v29.2S, v9.2S + add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f + add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f + add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f + mov x0, v20.d[0] + fcsel d5, d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f + mov x21, v12.d[0] + fcsel d29, d7, d21, eq // ubignum_of_qreglist 4 // INTERMEDIATE g + mov x5, v16.d[0] + lsr x26, x0, #32 + add x29, x21, x21 + umull x15, w5, w29 + add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add x12, x26, x26 + mov x30, v5.d[0] + fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g + lsr x11, x5, #32 + lsr x10, x30, #32 + trn2 v20.2S, v21.2S, v3.2S + add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + add x14, x11, x11 + trn2 v6.2S, v2.2S, v15.2S + trn1 v12.2S, 
v25.2S, v0.2S + add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2S, v23.2S, v13.2S + fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g + trn2 v17.2S, v27.2S, v24.2S + str d29, [tmpb+32] + add x17, x10, x10 + trn2 v4.2S, v28.2S, v1.2S + trn1 v5.2S, v28.2S, v1.2S + trn1 v28.2S, v2.2S, v15.2S + trn1 v2.2S, v22.2S, v18.2S + fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g + trn2 v15.2S, v22.2S, v18.2S + umull v22.2D, v12.2S, v20.2S + umull x22, w30, w17 + stp d29, d10, [tmpb+0] + trn2 v10.2S, v23.2S, v13.2S + trn2 v23.2S, v11.2S, v14.2S + trn1 v13.2S, v27.2S, v24.2S + fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g + trn1 v14.2S, v11.2S, v14.2S + umlal v22.2D, v2.2S, v6.2S + umull x25, w30, w30 + umlal v22.2D, v5.2S, v23.2S + add x3, x30, x30 + umlal v22.2D, v16.2S, v17.2S + add w30, w21, w21, lsl #1; + stp d27, d8, [tmpb+16] + add w30, w30, w21, lsl #4 + trn1 v11.2S, v26.2S, v19.2S + trn2 v8.2S, v26.2S, v19.2S + trn2 v19.2S, v25.2S, v0.2S + mul v29.2S, v20.2S, v31.2S + ldr x20, [tmpb+24] + umull v25.2D, v19.2S, v6.2S + add x1, x0, x0 + umull v27.2D, v19.2S, v23.2S + umull x9, w5, w1 + umull v0.2D, v12.2S, v23.2S + lsr x24, x20, #32 + mul v20.2S, v23.2S, v31.2S + lsr x16, x21, #32 + umlal v25.2D, v15.2S, v23.2S + umaddl x13, w11, w14, x9 + umlal v25.2D, v4.2S, v17.2S + umaddl x9, w14, w17, x15 + umull v24.2D, v12.2S, v6.2S + add w2, w16, w16, lsl #1; + fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f + add w2, w2, w16, lsl #4 + trn1 v18.2S, v21.2S, v3.2S + umull v3.2D, v19.2S, v29.2S + umull x28, w5, w3 + mul v1.2S, v6.2S, v31.2S + umull x8, w5, w5 + umlal v24.2D, v2.2S, v23.2S + umaddl x13, w21, w30, x13 + mul v23.2S, v17.2S, v31.2S + umaddl x27, w14, w12, x28 + trn2 v6.2S, v9.2S, v7.2S + mov x6, v26.d[0] + umlal v3.2D, v15.2S, v1.2S + add x16, x16, x16 + umlal v3.2D, v4.2S, v20.2S + lsr x4, x6, #32 + umlal v3.2D, v10.2S, v23.2S + add x7, x6, x6 + umull v26.2D, v19.2S, v8.2S + add x23, x4, x4 + umaddl x28, w5, w23, x22 + trn1 v7.2S, v9.2S, v7.2S + umlal v27.2D, v15.2S, v17.2S + add w15, w4, w4, lsl #1; + umlal v27.2D, v4.2S, v8.2S + add w15, w15, w4, lsl #4 + add w22, w10, w10, lsl #1; + umlal v24.2D, v5.2S, v17.2S + add w22, w22, w10, lsl #4 + umaddl x10, w11, w7, x28 + umlal v25.2D, v10.2S, v8.2S + umull x21, w5, w16 + umlal v25.2D, v6.2S, v29.2S + umaddl x23, w15, w23, x25 + umlal v27.2D, v10.2S, v29.2S + umull x19, w5, w12 + umlal v27.2D, v6.2S, v1.2S + umaddl x25, w11, w29, x21 + umlal v0.2D, v2.2S, v17.2S + umaddl x28, w0, w3, x9 + shl v21.2D, v25.2D, #1 + umaddl x4, w11, w1, x19 + umaddl x21, w2, w29, x4 + mul v25.2S, v8.2S, v31.2S + umlal v24.2D, v16.2S, v8.2S + umaddl x19, w0, w17, x25 + umlal v24.2D, v7.2S, v29.2S + umull x25, w5, w17 + umlal v24.2D, v19.2S, v28.2S + umaddl x4, w0, w16, x10 + umull v9.2D, v12.2S, v8.2S + umaddl x23, w5, w7, x23 + umlal v21.2D, v12.2S, v18.2S + add w10, w6, w6, lsl #1; + shl v27.2D, v27.2D, #1 + add w10, w10, w6, lsl #4 + umaddl x28, w26, w12, x28 + umlal v26.2D, v15.2S, v29.2S + umaddl x9, w14, w16, x23 + umlal v9.2D, v2.2S, v29.2S + umaddl x22, w22, w17, x8 + umlal v21.2D, v2.2S, v28.2S + umaddl x28, w6, w10, x28 + umaddl x27, w0, w0, x27 + add x8, x14, x14 + umlal v0.2D, v5.2S, v8.2S + umull x5, w5, w14 + umlal v9.2D, v5.2S, v1.2S + umaddl x14, w0, w29, x9 + umlal v26.2D, v4.2S, v1.2S + umaddl x6, w2, w16, x27 + umlal v22.2D, v7.2S, v8.2S + umaddl x5, w30, w17, x5 + umaddl x5, w2, w3, x5 + add x23, x17, x17 + umlal v27.2D, v12.2S, v28.2S + umaddl x13, 
w2, w23, x13 + umlal v26.2D, v10.2S, v20.2S + add x9, x12, x12 + umlal v9.2D, v16.2S, v20.2S + umaddl x27, w10, w29, x6 + umlal v0.2D, v16.2S, v29.2S + umaddl x6, w11, w3, x25 + umlal v22.2D, v19.2S, v18.2S + umaddl x19, w26, w3, x19 + mul v18.2S, v18.2S, v31.2S + umaddl x23, w15, w23, x27 + umlal v3.2D, v6.2S, v25.2S + umaddl x0, w0, w12, x6 + umlal v0.2D, v7.2S, v1.2S + add x11, x16, x16 + umlal v9.2D, v7.2S, v23.2S + umaddl x6, w12, w17, x14 + umlal v9.2D, v19.2S, v11.2S + umaddl x25, w26, w29, x4 + umlal v9.2D, v15.2S, v18.2S + umaddl x14, w10, w3, x13 + umull v25.2D, v12.2S, v17.2S + umaddl x27, w10, w16, x0 + umlal v26.2D, v6.2S, v23.2S + add x0, x25, x6, lsr #26 + mul v23.2S, v28.2S, v31.2S + umaddl x12, w10, w12, x5 + shl v3.2D, v3.2D, #1 + add x16, x22, x0, lsr #25 + umlal v21.2D, v5.2S, v14.2S + bic x22, x0, #0x1ffffff + umlal v3.2D, v12.2S, v11.2S + add x26, x16, x22, lsr #24 + umlal v3.2D, v2.2S, v18.2S + umaddl x16, w10, w17, x21 + umlal v3.2D, v5.2S, v23.2S + add x22, x26, x22, lsr #21 + umlal v9.2D, v4.2S, v23.2S + umaddl x5, w15, w29, x27 + umull v17.2D, v19.2S, v17.2S + umaddl x17, w30, w3, x22 + umlal v25.2D, v2.2S, v8.2S + umaddl x25, w15, w3, x16 + umlal v25.2D, v5.2S, v29.2S + umaddl x26, w15, w7, x19 + umlal v0.2D, v19.2S, v14.2S + umaddl x17, w2, w9, x17 + umlal v17.2D, v15.2S, v8.2S + ldr x19, [tmpb+0] + umlal v17.2D, v4.2S, v29.2S + ldr x7, [tmpb+8] + shl v29.2D, v26.2D, #1 + umaddl x13, w10, w1, x17 + umlal v0.2D, v15.2S, v13.2S + lsr x2, x19, #32 + umlal v29.2D, v12.2S, v13.2S + umaddl x27, w15, w1, x12 + umlal v29.2D, v2.2S, v11.2S + umaddl x30, w15, w8, x13 + umlal v29.2D, v5.2S, v18.2S + add x4, x7, x7 + umlal v29.2D, v16.2S, v23.2S + umaddl x29, w15, w9, x14 + umlal v0.2D, v4.2S, v11.2S + add x17, x27, x30, lsr #26 + umlal v0.2D, v10.2S, v18.2S + umaddl x16, w15, w11, x28 + umlal v0.2D, v6.2S, v23.2S + add x1, x29, x17, lsr #25 + umlal v25.2D, v16.2S, v1.2S + umull x11, w19, w4 + ldr x8, [tmpb+32] + mul v26.2S, v14.2S, v31.2S + umlal v17.2D, v10.2S, v1.2S + ldr x15, [tmpb+16] + umlal v17.2D, v6.2S, v20.2S + and x9, x30, #0x3ffffff + bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa + add x17, x2, x2 + lsr x10, x15, #32 + add x27, x25, x1, lsr #26 + umlal v25.2D, v7.2S, v20.2S + add x13, x10, x10 + umlal v25.2D, v19.2S, v13.2S + add x29, x23, x27, lsr #25 + umlal v25.2D, v15.2S, v11.2S + lsr x30, x8, #32 + umlal v25.2D, v4.2S, v18.2S + add x23, x5, x29, lsr #26 + umlal v25.2D, v10.2S, v23.2S + and x14, x29, #0x3ffffff + umlal v25.2D, v6.2S, v26.2S + add x5, x16, x23, lsr #25 + shl v8.2D, v17.2D, #1 + umaddl x12, w2, w17, x11 + and x29, x5, #0x3ffffff + umull x21, w19, w19 + umlal v29.2D, v7.2S, v26.2S + add w16, w10, w10, lsl #1; + umlal v3.2D, v16.2S, v26.2S + add w16, w16, w10, lsl #4 + bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa + add w10, w24, w24, lsl #1; + add x22, x26, x5, lsr #26 + add w10, w10, w24, lsl #4 + umlal v8.2D, v12.2S, v14.2S + umaddl x25, w16, w13, x21 + umlal v8.2D, v2.2S, v13.2S + bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa + umlal v8.2D, v5.2S, v11.2S + add x26, x24, x24 + umlal v8.2D, v16.2S, v18.2S + stp x14, x29, [tmpa+16] + umlal v8.2D, v7.2S, v23.2S + add w24, w30, w30, lsl #1; + usra v25.2D, v29.2D, #26 + add w24, w24, w30, lsl #4 + umull x29, w15, w15 + umlal v27.2D, v2.2S, v14.2S + umull x3, w15, w13 + umlal v27.2D, v5.2S, v13.2S + add x21, x20, x20 + umlal v24.2D, v15.2S, v14.2S + umull x5, w19, w21 + umlal v24.2D, v4.2S, v13.2S + and x11, x1, #0x3ffffff + usra v8.2D, 
v25.2D, #25 + and x1, x0, #0x1ffffff + umlal v27.2D, v16.2S, v11.2S + umaddl x23, w17, w13, x5 + umlal v27.2D, v7.2S, v18.2S + add x5, x30, x30 + usra v0.2D, v8.2D, #26 + add x0, x15, x15 + umlal v24.2D, v10.2S, v11.2S + umaddl x23, w7, w0, x23 + umlal v24.2D, v6.2S, v18.2S + lsr x30, x7, #32 + usra v27.2D, v0.2D, #25 + add x16, x30, x30 + and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + umaddl x15, w30, w16, x23 + ushr v23.2D, v30.2D, #1 + add w23, w8, w8, lsl #1; + usra v24.2D, v27.2D, #26 + add w23, w23, w8, lsl #4 + umaddl x14, w19, w5, x3 + and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + add x28, x8, x8 + and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + umaddl x8, w8, w23, x15 + and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + umaddl x3, w2, w28, x14 + umlal v22.2D, v15.2S, v28.2S + bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa + uzp1 v5.4S, v8.4S, v5.4S + umaddl x14, w24, w5, x29 + umaddl x5, w19, w28, x14 + ldr d18, [mask1] + mov v18.d[1], v18.d[0] + umaddl x15, w7, w26, x3 + mul v12.2S, v13.2S, v31.2S + umlal v21.2D, v16.2S, v13.2S + stp x9, x11, [tmpa+0] + umlal v21.2D, v7.2S, v11.2S + umaddl x29, w17, w26, x5 + umlal v22.2D, v4.2S, v14.2S + add w14, w20, w20, lsl #1; + umlal v22.2D, v10.2S, v13.2S + add w14, w14, w20, lsl #4 + umull x3, w19, w0 + umlal v22.2D, v6.2S, v11.2S + umaddl x29, w7, w21, x29 + usra v21.2D, v24.2D, #25 + umaddl x11, w20, w14, x12 + and v0.16B, v25.16B, v23.16B + umaddl x5, w30, w21, x15 + and v14.16B, v29.16B, v30.16B + umaddl x12, w16, w13, x29 + usra v22.2D, v21.2D, #26 + umaddl x29, w17, w16, x3 + umlal v3.2D, v7.2S, v12.2S + add x9, x26, x26 + and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + add x27, x5, x12, lsr #26 + bic v8.16B, v22.16B, v23.16B + umaddl x29, w7, w7, x29 + and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + add x5, x25, x27, lsr #25 + usra v3.2D, v8.2D, #25 + umaddl x25, w24, w9, x8 + umlal v9.2D, v10.2S, v26.2S + add x8, x13, x13 + trn1 v22.4S, v1.4S, v17.4S + umaddl x11, w10, w8, x11 + usra v3.2D, v8.2D, #24 + umull x20, w19, w16 + add v26.2S, v22.2S, v18.2S + ldr d28, [mask2] + umlal v9.2D, v6.2S, v12.2S + umaddl x3, w23, w0, x11 + usra v3.2D, v8.2D, #21 + umaddl x29, w10, w26, x29 + uzp1 v11.4S, v20.4S, v27.4S + umaddl x20, w2, w4, x20 + umaddl x9, w10, w21, x20 + mov v17.d[0], v22.d[1] + usra v9.2D, v3.2D, #26 + umull x15, w19, w13 + and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + add x11, x16, x16 + uzp2 v1.4S, v11.4S, v5.4S + umaddl x20, w23, w13, x9 + and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + umaddl x9, w2, w0, x15 + usra v14.2D, v9.2D, #25 + and x6, x6, #0x3ffffff + uzp1 v7.4S, v7.4S, v8.4S + umaddl x29, w23, w21, x29 + uzp1 v27.4S, v11.4S, v5.4S + umull x15, w19, w26 + usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + add x6, x6, x22, lsr #25 + and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + bic x22, x27, #0x1ffffff + sub v2.2S, v26.2S, v17.2S + add v9.2S, v22.2S, v17.2S + uzp1 v14.4S, v3.4S, v0.4S + umaddl x2, w2, w21, x15 
+ add v5.4S, v27.4S, v18.4S + add x5, x5, x22, lsr #24 + zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + mov v18.b[0], v28.b[0] + uzp1 v8.4S, v7.4S, v14.4S + add x22, x5, x22, lsr #21 + uzp2 v3.4S, v7.4S, v14.4S + umaddl x5, w7, w16, x9 + add v25.4S, v8.4S, v18.4S + umaddl x15, w14, w0, x22 + add v12.4S, v27.4S, v1.4S + add x9, x17, x17 + sub v14.4S, v5.4S, v1.4S + umull x19, w19, w17 + sub v18.4S, v25.4S, v3.4S + ldr x22, [tmpa+8] + add v20.4S, v8.4S, v3.4S + umaddl x15, w10, w11, x15 + zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + umaddl x14, w14, w13, x19 + zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + and x17, x27, #0x1ffffff + zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + umaddl x15, w23, w4, x15 + zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + umaddl x10, w10, w0, x14 + zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2S, v0.2S, #1 + mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 + shl v26.2S, v22.2S, #1 + shl v17.2S, v16.2S, #1 + mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 + shl v7.2S, v5.2S, #1 + shl v18.2S, v19.2S, #1 + umull v11.2D, v1.2S, v24.2S + umaddl x19, w23, w16, x10 + umull v6.2D, v1.2S, v17.2S + umaddl x10, w7, w13, x2 + mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 + mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 + umull v9.2D, v1.2S, v26.2S + ldr x13, [tmpa+0] + shl v28.2S, v15.2S, #1 + shl v3.2S, v10.2S, #1 + ldr x14, [tmpa+16] + mul v12.2S, v10.2S, v31.2S + umull v25.2D, v1.2S, v7.2S + ldr x2, [tmpa+24] + umlal v6.2D, v18.2S, v28.2S + umaddl x27, w30, w0, x10 + umaddl x16, w24, w0, x20 + shl v13.2S, v14.2S, #1 + umaddl x5, w23, w26, x5 + mul v2.2S, v22.2S, v31.2S + umull v21.2D, v1.2S, v13.2S + umaddl x23, w24, w8, x29 + umlal v11.2D, v18.2S, v19.2S + mov x10, #0x07fffffe07fffffe + sub x10, x10, #2 + umaddl x26, w24, w21, x5 + mul v29.2S, v14.2S, v31.2S + umlal v25.2D, v19.2S, v26.2S + add x7, x1, x6, lsr #26 + mul v20.2S, v4.2S, v31.2S + and x6, x6, #0x3ffffff + shl v8.2S, v18.2S, #1 + shl v4.2S, v4.2S, #1 + umlal v11.2D, v29.2S, v14.2S + bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa + umlal v25.2D, v0.2S, v3.2S + umaddl x0, w24, w4, x19 + umlal v25.2D, v15.2S, v13.2S + str x6, [tmpa+32] + umlal v21.2D, v18.2S, v4.2S + umaddl x8, w24, w11, x3 + umlal v21.2D, v0.2S, v17.2S + ldr x30, [tmpa+32] + mul v14.2S, v5.2S, v31.2S + add x2, x2, x10 + shl v5.2S, v28.2S, #1 + shl v27.2S, v4.2S, #1 + umlal v6.2D, v0.2S, v0.2S + umaddl x11, w24, w9, x15 + umlal v6.2D, v12.2S, v3.2S + add x4, x30, x10 + umlal v11.2D, v14.2S, v5.2S + add x3, x22, x10 + umlal v11.2D, v2.2S, v17.2S + add x6, x0, x11, lsr #26 + umlal v11.2D, v12.2S, v27.2S + add x14, x14, x10 + umlal v6.2D, v14.2S, v27.2S + add x8, x8, x6, lsr #25 + umlal v6.2D, v2.2S, v13.2S + movk x10, #0xffb4 + umlal v25.2D, v16.2S, v4.2S + add x29, x16, x8, lsr #26 + umull v27.2D, v1.2S, v3.2S + and x11, x11, #0x3ffffff + umlal v9.2D, v18.2S, v3.2S + add x19, x13, x10 + umlal v9.2D, v0.2S, v13.2S + and x5, x8, 
#0x3ffffff + umlal v9.2D, v28.2S, v4.2S + bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb + umlal v9.2D, v16.2S, v16.2S + umaddl x30, w24, w28, x27 + umlal v9.2D, v14.2S, v7.2S + sub x13, x19, x11 + umull v10.2D, v1.2S, v18.2S + add x7, x23, x29, lsr #25 + umlal v21.2D, v28.2S, v15.2S + lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e + umlal v21.2D, v2.2S, v22.2S + add x0, x26, x7, lsr #26 + usra v25.2D, v9.2D, #26 + and x20, x7, #0x3ffffff + umull v22.2D, v1.2S, v1.2S + add x8, x25, x0, lsr #25 + umull v7.2D, v1.2S, v28.2S + and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt + bic v18.16B, v25.16B, v23.16B + and x19, x8, #0x3ffffff + and v16.16B, v9.16B, v30.16B + and x7, x12, #0x3ffffff + usra v22.2D, v18.2D, #25 + add x10, x30, x8, lsr #26 + umlal v7.2D, v19.2S, v24.2S + bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb + and v9.16B, v25.16B, v23.16B + add x27, x7, x10, lsr #25 + usra v22.2D, v18.2D, #24 + mov x21, #60833 + lsl x21, x21, #1 + add x15, x17, x27, lsr #26 + shl v25.2S, v3.2S, #1 + umlal v7.2D, v14.2S, v17.2S + and x29, x27, #0x3ffffff + usra v22.2D, v18.2D, #21 + bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt + umlal v10.2D, v14.2S, v24.2S + and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt + umlal v10.2D, v2.2S, v28.2S + sub x6, x3, x5 + umlal v10.2D, v12.2S, v17.2S + umaddl x25, w16, w21, x17 + umlal v10.2D, v29.2S, v4.2S + mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt + umlal v22.2D, v20.2S, v4.2S + lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e + umlal v22.2D, v14.2S, v8.2S + and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt + umlal v22.2D, v2.2S, v24.2S + stp x11, x5, [tmpb+0] + umlal v22.2D, v12.2S, v5.2S + bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb + umlal v22.2D, v29.2S, v17.2S + umaddl x12, w6, w21, x12 + umull v18.2D, v1.2S, v4.2S + bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb + umlal v7.2D, v2.2S, v4.2S + sub x7, x14, x20 + umlal v27.2D, v19.2S, v13.2S + mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt + usra v10.2D, v22.2D, #26 + lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e + umlal v18.2D, v19.2S, v17.2S + and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt + umlal v7.2D, v12.2S, v13.2S + sub x5, x2, x19 + usra v11.2D, v10.2D, #25 + mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt + umlal v27.2D, v0.2S, v4.2S + umlal v21.2D, v14.2S, v25.2S + sub x23, x4, x29 + usra v7.2D, v11.2D, #26 + mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt + umlal v18.2D, v0.2S, v28.2S + lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e + umlal v27.2D, v15.2S, v17.2S + str x29, [tmpb+32] + usra v6.2D, v7.2D, #25 + mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt + and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + umaddl x27, w26, w21, x1 + umlal v18.2D, v14.2S, v13.2S + umaddl x30, w23, w21, x0 + umlal v18.2D, v2.2S, v3.2S + lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e + and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 
// INTERMEDIATE H|L = x5|t3 + umaddl x4, w14, w21, x24 + ldr x0, [tmpa+0] + mov v0.s[1], w0 + lsr x0, x0, #32 + mov v1.s[1], w0 + umaddl x9, w7, w21, x8 + usra v18.2D, v6.2D, #26 + umaddl x24, w10, w21, x28 + and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + umaddl x8, w22, w21, x15 + umlal v27.2D, v14.2S, v26.2S + umaddl x15, w13, w21, x17 + usra v21.2D, v18.2D, #25 + stp x20, x19, [tmpb+16] + and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + lsr x29, x8, #25 + ldr x3, [tmpb+0] + mov v10.s[1], w3 + lsr x3, x3, #32 + mov v11.s[1], w3 + add x17, x15, x29 + usra v27.2D, v21.2D, #26 + add x28, x17, x29, lsl #1 + and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and x20, x8, #0x1ffffff + and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + add x17, x28, x29, lsl #4 + and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + ldr x3, [tmpb+8] + mov v22.s[1], w3 + lsr x3, x3, #32 + mov v23.s[1], w3 + add x29, x25, x17, lsr #26 + ldr x15, [pointx+0] + mov v10.s[0], w15 + lsr x15, x15, #32 + mov v11.s[0], w15 + and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce + usra v16.2D, v27.2D, #25 + add x8, x12, x29, lsr #25 + ldr x3, [tmpb+16] + mov v14.s[1], w3 + lsr x3, x3, #32 + mov v15.s[1], w3 + and x12, x29, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bce + ldr x15, [pointx+8] + mov v22.s[0], w15 + lsr x15, x15, #32 + mov v23.s[0], w15 + add x28, x27, x8, lsr #26 + and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + umull x1, w12, w10 + ldr x3, [tmpb+24] + mov v17.s[1], w3 + lsr x3, x3, #32 + mov v18.s[1], w3 + add x25, x9, x28, lsr #25 + ldr x15, [pointx+16] + mov v14.s[0], w15 + lsr x15, x15, #32 + mov v15.s[0], w15 + umaddl x19, w5, w21, x2 + usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + add x2, x4, x25, lsr #26 + ldr x3, [tmpb+32] + mov v24.s[1], w3 + lsr x3, x3, #32 + mov v25.s[1], w3 + umull x3, w12, w23 + ldr x15, [pointx+24] + mov v17.s[0], w15 + lsr x15, x15, #32 + mov v18.s[0], w15 + add x29, x19, x2, lsr #25 + umull v26.2D, v0.2S, v23.2S + and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce + ldr x0, [tmpa+8] + mov v2.s[1], w0 + lsr x0, x0, #32 + mov v3.s[1], w0 + umaddl x27, w21, w5, x3 + ldr x15, [pointx+32] + mov v24.s[0], w15 + lsr x15, x15, #32 + mov v25.s[0], w15 + add x17, x24, x29, lsr #26 + umull v29.2D, v1.2S, v18.2S + and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce + umull v20.2D, v0.2S, v15.2S + add x19, x30, x17, lsr #25 + and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce + mul v12.2S, v25.2S, v31.2S + ldr x0, [tmpa+16] + mov v4.s[1], w0 + lsr x0, x0, #32 + mov v5.s[1], w0 + add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v26.2D, v2.2S, v11.2S + add w28, w3, w3, lsl #1; + umlal v20.2D, v2.2S, v23.2S + add w28, w28, w3, lsl #4 + umull x8, w12, w5 + ldr x0, [tmpa+24] + mov v6.s[1], w0 + lsr x0, x0, #32 + mov v7.s[1], w0 + and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce + mul v16.2S, v18.2S, v31.2S + add w17, w4, w4, lsl #1; + umull v21.2D, v1.2S, v15.2S + add w17, w17, w4, lsl #4 + umaddl x25, w21, w7, x8 + umlal v20.2D, v4.2S, v11.2S + add w8, w21, w21, lsl 
#1; + ldr x0, [tmpa+32] + add w8, w8, w21, lsl #4 + mov v8.s[1], w0 + lsr x0, x0, #32 + mov v9.s[1], w0 + and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce + umlal v29.2D, v3.2S, v15.2S + umaddl x24, w2, w6, x25 + umull v13.2D, v0.2S, v25.2S + umaddl x25, w2, w7, x27 + umaddl x0, w3, w6, x25 + mul v19.2S, v15.2S, v31.2S + umull v27.2D, v0.2S, v18.2S + umaddl x20, w3, w13, x24 + umlal v20.2D, v6.2S, v12.2S + umaddl x24, w21, w14, x1 + umlal v13.2D, v2.2S, v18.2S + umaddl x9, w4, w13, x0 + umull v25.2D, v0.2S, v11.2S + umaddl x20, w17, w23, x20 + umlal v27.2D, v2.2S, v15.2S + umaddl x0, w2, w26, x24 + umull v28.2D, v1.2S, v11.2S + umull x24, w17, w5 + umlal v29.2D, v5.2S, v23.2S + umaddl x9, w11, w22, x9 + umlal v13.2D, v4.2S, v15.2S + umaddl x27, w3, w16, x0 + umlal v27.2D, v4.2S, v23.2S + umull x0, w17, w14 + umlal v27.2D, v6.2S, v11.2S + umull x4, w12, w14 + umlal v27.2D, v8.2S, v12.2S + umaddl x25, w11, w10, x20 + umlal v27.2D, v1.2S, v17.2S + umaddl x0, w28, w10, x0 + umlal v13.2D, v6.2S, v23.2S + umull x3, w17, w6 + umlal v13.2D, v8.2S, v11.2S + umaddl x1, w21, w26, x4 + umlal v20.2D, v8.2S, v16.2S + umaddl x4, w2, w13, x24 + umlal v28.2D, v3.2S, v12.2S + umaddl x20, w28, w7, x3 + umlal v29.2D, v7.2S, v11.2S + and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce + umlal v29.2D, v9.2S, v12.2S + umaddl x19, w17, w22, x27 + add w27, w2, w2, lsl #1; + mul v18.2S, v24.2S, v31.2S + add w27, w27, w2, lsl #4 + umlal v21.2D, v3.2S, v23.2S + umull x24, w17, w7 + umlal v13.2D, v1.2S, v24.2S + add x19, x19, x19 + shl v29.2D, v29.2D, #1 + umaddl x1, w2, w16, x1 + umull v15.2D, v1.2S, v23.2S + umaddl x0, w27, w22, x0 + umlal v29.2D, v0.2S, v24.2S + umaddl x2, w28, w5, x24 + mul v24.2S, v23.2S, v31.2S + umaddl x4, w28, w23, x4 + umlal v21.2D, v5.2S, v11.2S + umaddl x24, w27, w5, x20 + umlal v20.2D, v1.2S, v14.2S + umaddl x20, w11, w23, x19 + umlal v26.2D, v4.2S, v12.2S + umaddl x19, w27, w23, x2 + umlal v26.2D, v6.2S, v16.2S + umaddl x2, w21, w6, x4 + umlal v29.2D, v2.2S, v17.2S + umaddl x24, w8, w23, x24 + umlal v15.2D, v3.2S, v11.2S + umaddl x0, w21, w16, x0 + umaddl x4, w21, w13, x19 + mul v23.2S, v11.2S, v31.2S + umlal v20.2D, v3.2S, v22.2S + umaddl x2, w12, w7, x2 + umlal v20.2D, v5.2S, v10.2S + umaddl x19, w12, w26, x0 + umlal v29.2D, v4.2S, v14.2S + umaddl x0, w12, w13, x24 + umlal v26.2D, v8.2S, v19.2S + umaddl x20, w15, w5, x20 + umlal v26.2D, v1.2S, v22.2S + umaddl x21, w15, w10, x9 + umlal v26.2D, v3.2S, v10.2S + and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce + umlal v29.2D, v6.2S, v22.2S + umaddl x20, w30, w7, x20 + umaddl x1, w28, w22, x1 + add x24, x19, x19 + umull v11.2D, v1.2S, v12.2S + add w19, w3, w3, lsl #1; + umlal v26.2D, v5.2S, v18.2S + add w19, w19, w3, lsl #4 + umaddl x20, w9, w6, x20 + umlal v29.2D, v8.2S, v10.2S + add w29, w9, w9, lsl #1; + umlal v13.2D, v3.2S, v17.2S + add w29, w29, w9, lsl #4 + umaddl x2, w19, w10, x2 + umlal v11.2D, v3.2S, v16.2S + umaddl x21, w30, w14, x21 + umlal v11.2D, v5.2S, v19.2S + umaddl x20, w3, w13, x20 + umlal v11.2D, v7.2S, v24.2S + umaddl x2, w29, w22, x2 + umlal v11.2D, v9.2S, v23.2S + umaddl x21, w9, w26, x21 + ushr v23.2D, v30.2D, #1 + umaddl x1, w17, w10, x1 + umlal v13.2D, v5.2S, v14.2S + umaddl x24, w19, w5, x24 + umlal v27.2D, v3.2S, v14.2S + umaddl x21, w3, w16, x21 + shl v11.2D, v11.2D, #1 + add w3, w30, w30, lsl #1; + umlal v28.2D, v5.2S, v16.2S + add w3, w3, w30, lsl #4 + umaddl x24, w29, w23, x24 + umlal v28.2D, v7.2S, v19.2S + add x1, x1, x1 + umlal v28.2D, v9.2S, v24.2S + 
umaddl x1, w11, w5, x1 + umlal v15.2D, v5.2S, v12.2S + umaddl x24, w30, w13, x24 + umlal v15.2D, v7.2S, v16.2S + umaddl x25, w15, w14, x25 + umlal v15.2D, v9.2S, v19.2S + umaddl x1, w15, w7, x1 + shl v28.2D, v28.2D, #1 + umaddl x24, w15, w6, x24 + umlal v21.2D, v7.2S, v12.2S + umaddl x2, w30, w16, x2 + umlal v21.2D, v9.2S, v16.2S + umaddl x25, w30, w26, x25 + shl v15.2D, v15.2D, #1 + umaddl x30, w30, w6, x1 + umlal v28.2D, v0.2S, v22.2S + umaddl x1, w15, w26, x2 + umlal v28.2D, v2.2S, v10.2S + umaddl x2, w9, w16, x25 + shl v21.2D, v21.2D, #1 + umaddl x24, w11, w7, x24 + umlal v15.2D, v0.2S, v14.2S + umaddl x1, w11, w14, x1 + umlal v21.2D, v0.2S, v17.2S + umaddl x25, w9, w13, x30 + umlal v28.2D, v4.2S, v18.2S + umaddl x0, w19, w26, x0 + umlal v25.2D, v2.2S, v12.2S + add x1, x1, x24, lsr #26 + umlal v25.2D, v4.2S, v16.2S + umaddl x30, w19, w22, x2 + umlal v21.2D, v2.2S, v14.2S + umaddl x4, w12, w6, x4 + mul v14.2S, v14.2S, v31.2S + umaddl x25, w19, w23, x25 + and x2, x1, #0x1ffffff + mul v16.2S, v17.2S, v31.2S + umlal v25.2D, v6.2S, v19.2S + umaddl x9, w19, w14, x4 + umlal v13.2D, v7.2S, v22.2S + add x25, x25, x1, lsr #25 + umlal v21.2D, v4.2S, v22.2S + umaddl x0, w29, w14, x0 + umlal v26.2D, v7.2S, v16.2S + add x30, x30, x25, lsr #26 + umlal v26.2D, v9.2S, v14.2S + add w1, w15, w15, lsl #1; + umlal v28.2D, v6.2S, v16.2S + add w1, w1, w15, lsl #4 + add x4, x20, x30, lsr #25 + umlal v28.2D, v8.2S, v14.2S + and x25, x25, #0x3ffffff + umlal v15.2D, v2.2S, v22.2S + add x21, x21, x4, lsr #26 + umlal v11.2D, v0.2S, v10.2S + bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4 + umlal v11.2D, v2.2S, v18.2S + bic x30, x21, #0x3ffffff + usra v26.2D, v28.2D, #26 + lsr x20, x30, #26 + umlal v15.2D, v4.2S, v10.2S + add x20, x20, x30, lsr #25 + umlal v15.2D, v6.2S, v18.2S + umaddl x9, w29, w10, x9 + umlal v15.2D, v8.2S, v16.2S + add x30, x20, x30, lsr #22 + umlal v27.2D, v5.2S, v22.2S + umull x20, w17, w26 + umlal v20.2D, v7.2S, v18.2S + umaddl x30, w17, w16, x30 + umlal v20.2D, v9.2S, v16.2S + umaddl x17, w3, w10, x0 + usra v15.2D, v26.2D, #25 + umaddl x0, w28, w14, x20 + umlal v27.2D, v7.2S, v10.2S + umaddl x20, w28, w26, x30 + umlal v27.2D, v9.2S, v18.2S + add w28, w12, w12, lsl #1; + usra v20.2D, v15.2D, #26 + add w28, w28, w12, lsl #4 + umaddl x30, w27, w10, x0 + and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5 + umaddl x27, w27, w14, x20 + umaddl x0, w8, w10, x27 + mul v12.2S, v22.2S, v31.2S + and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5 + umaddl x14, w3, w22, x9 + umlal v21.2D, v6.2S, v10.2S + umaddl x27, w8, w22, x30 + trn1 v15.4S, v17.4S, v15.4S // FINAL z3 + umaddl x10, w28, w22, x0 + umlal v11.2D, v4.2S, v16.2S + umaddl x30, w15, w16, x14 + and v26.16B, v26.16B, v23.16B + umaddl x28, w12, w16, x27 + umlal v21.2D, v8.2S, v18.2S + add x10, x10, x10 + umlal v25.2D, v8.2S, v24.2S + umaddl x20, w19, w6, x10 + umlal v25.2D, v1.2S, v10.2S + add x28, x28, x28 + umlal v25.2D, v3.2S, v18.2S + umaddl x28, w19, w7, x28 + usra v21.2D, v20.2D, #25 + umaddl x0, w29, w7, x20 + umlal v11.2D, v6.2S, v14.2S + umaddl x10, w11, w26, x30 + umlal v13.2D, v9.2S, v10.2S + umaddl x19, w29, w5, x28 + usra v27.2D, v21.2D, #26 + umaddl x0, w3, w5, x0 + umlal v25.2D, v5.2S, v16.2S + umaddl x20, w1, w22, x17 + and v20.16B, v28.16B, v30.16B + umaddl x29, w3, w23, x19 + usra v29.2D, v27.2D, #25 + umaddl x3, w1, w23, x0 + and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + 
ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5 + umlal v11.2D, v8.2S, v12.2S + umaddl x12, w15, w13, x29 + usra v13.2D, v29.2D, #26 + umaddl x7, w11, w13, x3 + trn1 v6.4S, v6.4S, v7.4S + umaddl x17, w11, w16, x20 + umlal v25.2D, v7.2S, v14.2S + and x23, x4, #0x3ffffff + bic v19.16B, v13.16B, v23.16B + umaddl x19, w11, w6, x12 + and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5 + add x3, x17, x7, lsr #26 + usra v11.2D, v19.2D, #25 + trn1 v2.4S, v2.4S, v3.4S + add x17, x19, x3, lsr #25 + and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5 + and x5, x7, #0x3ffffff + usra v11.2D, v19.2D, #24 + add x7, x10, x17, lsr #26 + trn1 v0.4S, v0.4S, v1.4S + and x19, x24, #0x3ffffff + and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5 + add x29, x19, x7, lsr #25 + usra v11.2D, v19.2D, #21 + bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4 + trn1 v17.4S, v13.4S, v27.4S // FINAL z3 + add x19, x2, x29, lsr #26 + trn1 v19.4S, v21.4S, v28.4S // FINAL z3 + and x3, x29, #0x3ffffff + mov v16.d[0], v6.d[1] // FINAL x3 + mov v6.d[0], v17.d[1] // FINAL x2 + trn1 v8.4S, v8.4S, v9.4S + bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4 + and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5 + bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4 + mov v18.d[0], v8.d[1] // FINAL x3 + mov v8.d[0], v19.d[1] // FINAL x2 + umlal v25.2D, v9.2S, v12.2S + mov v9.d[0], x23 // FINAL z2 + mov v7.d[0], x25 // FINAL z2 + ldr d29, [mask1] + mov v12.d[0], v2.d[1] // FINAL x3 + trn1 v4.4S, v4.4S, v5.4S + and x17, x17, #0x3ffffff + usra v25.2D, v11.2D, #26 + mov v10.d[0], v0.d[1] // FINAL x3 + mov v14.d[0], v4.d[1] // FINAL x3 + mov v4.d[0], v15.d[1] // FINAL x2 + usra v20.2D, v25.2D, #25 + and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5 + bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4 + mov v5.d[0], x3 // depth 86 + mov v1.d[0], x5 // FINAL z2 + usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5 + and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5 + trn1 v11.4S, v21.4S, v27.4S // FINAL z3 + trn1 v13.4S, v28.4S, v26.4S // FINAL z3 + mov v0.d[0], v11.d[1] // FINAL x2 + mov v3.d[0], x17 // FINAL z2 + mov v2.d[0], v13.d[1] // FINAL x2 + ldr d28, [mask2] + + ldr x0, [i] + subs x0, x0, #1 + str x0, [i] bcs curve25519_x25519_byte_scalarloop -// Multiplex directly into (xn,zn) then do three pure doubling steps; -// this accounts for the implicit zeroing of the three lowest bits -// of the scalar. On the very last doubling we *fully* reduce zn mod -// p_25519 to ease checking for degeneracy below. 
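+
+// (The repacking of the base 25.5 digits back into a saturated 4-digit
+// form, performed below for X2 and Z2, can be modelled roughly in C as
+// follows, for exposition only, mirroring the pairing the code uses:
+// adjacent digits are first combined into five 51-bit chunks, which are
+// then accumulated at bit positions 0, 51, 102, 153 and 204 of the
+// 256-bit result; "repack_25p5" is a name invented here:
+//
+//      #include <stdint.h>
+//
+//      static void repack_25p5(uint64_t z[4], const uint32_t d[10])
+//      {
+//        uint64_t t[5];
+//        unsigned __int128 a;
+//        for (int i = 0; i < 5; i++)    // pair up digits 2i and 2i+1
+//          t[i] = (uint64_t) d[2*i] | ((uint64_t) d[2*i+1] << 26);
+//        a = (unsigned __int128) t[0] + ((unsigned __int128) t[1] << 51);
+//        z[0] = (uint64_t) a;
+//        a = (a >> 64) + (t[1] >> 13) + ((unsigned __int128) t[2] << 38);
+//        z[1] = (uint64_t) a;
+//        a = (a >> 64) + (t[2] >> 26) + ((unsigned __int128) t[3] << 25);
+//        z[2] = (uint64_t) a;
+//        z[3] = (uint64_t) ((a >> 64) + (t[3] >> 39) + ((unsigned __int128) t[4] << 12));
+//      }
+//
+// The Z2 repacking additionally folds bit 25 of its top digit back into
+// the bottom with weight 19 beforehand, as its comment explains, so that
+// the sum is guaranteed to stay within 256 bits.)
+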
-
- cmp swap, xzr
- mux_4(xn,xm,xn)
- mux_4(zn,zm,zn)
-
- sub_twice4(d,xn,zn)
- add_twice4(s,xn,zn)
- sqr_4(d,d)
- sqr_4(s,s)
- sub_twice4(p,s,d)
- mov x1, 0xdb42
- orr x1, x1, 0x10000
- cmadd_4(e,p,d)
- mul_4(xn,s,d)
- mul_4(zn,p,e)
-
- sub_twice4(d,xn,zn)
- add_twice4(s,xn,zn)
- sqr_4(d,d)
- sqr_4(s,s)
- sub_twice4(p,s,d)
- mov x1, 0xdb42
- orr x1, x1, 0x10000
- cmadd_4(e,p,d)
- mul_4(xn,s,d)
- mul_4(zn,p,e)
-
- sub_twice4(d,xn,zn)
- add_twice4(s,xn,zn)
- sqr_4(d,d)
- sqr_4(s,s)
- sub_twice4(p,s,d)
- mov x1, 0xdb42
- orr x1, x1, 0x10000
- cmadd_4(e,p,d)
- mul_4(xn,s,d)
- mul_p25519(zn,p,e)
-
-// The projective result of the scalar multiplication is now (xn,zn).
-// Prepare to call the modular inverse function to get xm = 1/zn
-
- add x0, xm
+// Repack X2 into the saturated representation as 256-bit value xn.
+// This does not fully normalize mod 2^255-19 but stays within 256 bits.
+
+ mov w0, v0.s[0]
+ mov w1, v0.s[1]
+ mov w2, v2.s[0]
+ mov w3, v2.s[1]
+ mov w4, v4.s[0]
+ mov w5, v4.s[1]
+ mov w6, v6.s[0]
+ mov w7, v6.s[1]
+ mov w8, v8.s[0]
+ mov w9, v8.s[1]
+
+ add x0, x0, x1, lsl #26
+ add x1, x2, x3, lsl #26
+ add x2, x4, x5, lsl #26
+ add x3, x6, x7, lsl #26
+ add x4, x8, x9, lsl #26
+
+ adds x0, x0, x1, lsl #51
+ lsr x6, x1, #13
+ lsl x7, x2, #38
+ adcs x1, x6, x7
+ lsr x8, x2, #26
+ lsl x9, x3, #25
+ adcs x2, x8, x9
+ lsr x10, x3, #39
+ lsl x11, x4, #12
+ adc x3, x10, x11
+ stp x0, x1, [xn]
+ stp x2, x3, [xn+16]
+
+// Repack Z2 into the saturated representation as 256-bit value zn.
+// This does not fully normalize mod 2^255-19. However, since Z2,
+// unlike X2, was not repacked (within the last multiplication) in
+// right-to-left order, its top digit can be any 26-bit value, on
+// the face of it. To make sure we don't overflow 256 bits here
+// we remove b = 25th bit of the 9th digit (now scaled by 2^230
+// giving bit 25 a final weighting of 2^255) and add 19 * b
+// to the bottom of the sum here to compensate mod 2^255-19.
+
+ mov w0, v1.s[0]
+ mov w1, v1.s[1]
+ mov w2, v3.s[0]
+ mov w3, v3.s[1]
+ mov w4, v5.s[0]
+ mov w5, v5.s[1]
+ mov w6, v7.s[0]
+ mov w7, v7.s[1]
+ mov w8, v9.s[0]
+ mov w9, v9.s[1]
+
+ mov w10, #19
+ add x0, x0, x1, lsl #26
+ tst x9, #0x2000000
+ add x1, x2, x3, lsl #26
+ csel x10, x10, xzr, ne
+ add x2, x4, x5, lsl #26
+ and x9, x9, #0x1FFFFFF
+ add x3, x6, x7, lsl #26
+ add x0, x0, x10
+ add x4, x8, x9, lsl #26
+
+ adds x0, x0, x1, lsl #51
+ lsr x6, x1, #13
+ lsl x7, x2, #38
+ adcs x1, x6, x7
+ lsr x8, x2, #26
+ lsl x9, x3, #25
+ adcs x2, x8, x9
+ lsr x10, x3, #39
+ lsl x11, x4, #12
+ adc x3, x10, x11
+ stp x0, x1, [zn]
+ stp x2, x3, [zn+16]
+
+// Because the lowest bit (indeed, the three lowest bits) of the scalar
+// were forced to zero, we know that the projective result of the scalar
+// multiplication was in (X2,Z2) and is now (xn,zn) in saturated form.
+// Prepare to call the modular inverse function to get zn' = 1/zn.
+
+ add x0, zn
 add x1, zn

// Inline copy of bignum_inv_p25519, identical except for stripping out
@@ -978,7 +1466,7 @@ curve25519_x25519_byte_scalarloop:
// and reclaiming room on the stack. For more details and explanations see
// "arm/curve25519/bignum_inv_p25519.S". Note that the stack it uses for
// its own temporaries is 128 bytes, so it has no effect on variables
-// that are needed in the rest of our computation here: res, xm and zn.
+// that are needed in the rest of our computation here: res, xn, and zn.
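As a cross-check on the repacking added above (an editor's sketch in Python, not part of the patch, placed here before the inlined inverse body): the ten unsaturated limbs live in radix 2^25.5, and the Z2 path clears bit 25 of the top digit (overall weight 2^255) while adding 19 * b at the bottom, which is sound because 2^255 = 19 (mod 2^255-19). The function name repack_z2 and the limb-width bounds (even-indexed digits below 2^26, odd-indexed below 2^25, top digit a full 26 bits) are assumptions read off the masks and shifts in the code.

    import random

    P = 2**255 - 19

    def repack_z2(d):
        # Pair adjacent limbs as the scalar code does: x[i] has weight 2^(51*i).
        x = [d[2*i] + (d[2*i+1] << 26) for i in range(5)]
        # Clear bit 25 of digit d[9] (bit 51 of x[4], overall weight 2^255)
        # and compensate with +19 at the bottom, since 2^255 = 19 (mod p).
        b = (d[9] >> 25) & 1
        x[4] -= b << 51
        x[0] += 19 * b
        v = sum(xi << (51 * i) for i, xi in enumerate(x))
        assert v < 2**256  # fits the four 64-bit words stored at zn
        return v

    # Random limbs of the assumed widths, with a full 26-bit top digit.
    d = [random.randrange(1 << (26 if i % 2 == 0 else 25)) for i in range(10)]
    d[9] = random.randrange(1 << 26)
    value = sum(di << (26*((i+1)//2) + 25*(i//2)) for i, di in enumerate(d))
    assert repack_z2(d) % P == value % P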
mov x20, x0 mov x10, #0xffffffffffffffed @@ -2009,102 +2497,278 @@ curve25519_x25519_byte_invmidloop: stp x0, x1, [x4] stp x2, x5, [x4, #16] -// Since we eventually want to return 0 when the result is the point at -// infinity, we force xn = 0 whenever zn = 0. This avoids building in a -// dependency on the behavior of modular inverse in out-of-scope cases. - - ldp x0, x1, [zn] - ldp x2, x3, [zn+16] - orr x0, x0, x1 - orr x2, x2, x3 - orr x4, x0, x2 - cmp x4, xzr - ldp x0, x1, [xn] - csel x0, x0, xzr, ne - csel x1, x1, xzr, ne - ldp x2, x3, [xn+16] - stp x0, x1, [xn] - csel x2, x2, xzr, ne - csel x3, x3, xzr, ne - stp x2, x3, [xn+16] - // Now the result is xn * (1/zn), fully reduced modulo p. +// Note that in the degenerate case zn = 0 (mod p_25519), the +// modular inverse code above will produce 1/zn = 0, giving +// the correct overall X25519 result of zero for the point at +// infinity. The multiplication below is just an inlined +// version of bignum_mul_p25519 except for the detailed +// addressing of inputs and outputs + + ldp x3, x4, [xn] + ldp x5, x6, [zn] + umull x7, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x8, w16, w0 + umull x16, w3, w16 + adds x7, x7, x15, lsl #32 + lsr x15, x15, #32 + adc x8, x8, x15 + adds x7, x7, x16, lsl #32 + lsr x16, x16, #32 + adc x8, x8, x16 + mul x9, x4, x6 + umulh x10, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x9, x9, x8 + adc x10, x10, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x8, x7, x9 + adcs x9, x9, x10 + adc x10, x10, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x8, x15, x8 + eor x3, x3, x16 + adcs x9, x3, x9 + adc x10, x10, x16 + ldp x3, x4, [xn+16] + ldp x5, x6, [zn+16] + umull x11, w3, w5 + lsr x0, x3, #32 + umull x15, w0, w5 + lsr x16, x5, #32 + umull x12, w16, w0 + umull x16, w3, w16 + adds x11, x11, x15, lsl #32 + lsr x15, x15, #32 + adc x12, x12, x15 + adds x11, x11, x16, lsl #32 + lsr x16, x16, #32 + adc x12, x12, x16 + mul x13, x4, x6 + umulh x14, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x16, cc + adds x13, x13, x12 + adc x14, x14, xzr + subs x3, x5, x6 + cneg x3, x3, cc + cinv x16, x16, cc + mul x15, x4, x3 + umulh x3, x4, x3 + adds x12, x11, x13 + adcs x13, x13, x14 + adc x14, x14, xzr + cmn x16, #0x1 + eor x15, x15, x16 + adcs x12, x15, x12 + eor x3, x3, x16 + adcs x13, x3, x13 + adc x14, x14, x16 + ldp x3, x4, [xn+16] + ldp x15, x16, [xn] + subs x3, x3, x15 + sbcs x4, x4, x16 + csetm x16, cc + ldp x15, x0, [zn] + subs x5, x15, x5 + sbcs x6, x0, x6 + csetm x0, cc + eor x3, x3, x16 + subs x3, x3, x16 + eor x4, x4, x16 + sbc x4, x4, x16 + eor x5, x5, x0 + subs x5, x5, x0 + eor x6, x6, x0 + sbc x6, x6, x0 + eor x16, x0, x16 + adds x11, x11, x9 + adcs x12, x12, x10 + adcs x13, x13, xzr + adc x14, x14, xzr + mul x2, x3, x5 + umulh x0, x3, x5 + mul x15, x4, x6 + umulh x1, x4, x6 + subs x4, x4, x3 + cneg x4, x4, cc + csetm x9, cc + adds x15, x15, x0 + adc x1, x1, xzr + subs x6, x5, x6 + cneg x6, x6, cc + cinv x9, x9, cc + mul x5, x4, x6 + umulh x6, x4, x6 + adds x0, x2, x15 + adcs x15, x15, x1 + adc x1, x1, xzr + cmn x9, #0x1 + eor x5, x5, x9 + adcs x0, x5, x0 + eor x6, x6, x9 + adcs x15, x6, x15 + adc x1, x1, x9 + adds x9, x11, x7 + adcs x10, x12, x8 + adcs x11, x13, x11 + adcs x12, x14, x12 + adcs x13, x13, xzr + adc x14, x14, xzr + cmn x16, #0x1 + eor x2, x2, x16 + adcs x9, x2, x9 + eor x0, x0, x16 + adcs x10, x0, x10 + eor x15, x15, x16 + adcs x11, x15, x11 + eor x1, x1, x16 + adcs x12, x1, x12 + adcs x13, x13, 
x16 + adc x14, x14, x16 + mov x3, #0x26 + umull x4, w11, w3 + add x4, x4, w7, uxtw + lsr x7, x7, #32 + lsr x11, x11, #32 + umaddl x11, w11, w3, x7 + mov x7, x4 + umull x4, w12, w3 + add x4, x4, w8, uxtw + lsr x8, x8, #32 + lsr x12, x12, #32 + umaddl x12, w12, w3, x8 + mov x8, x4 + umull x4, w13, w3 + add x4, x4, w9, uxtw + lsr x9, x9, #32 + lsr x13, x13, #32 + umaddl x13, w13, w3, x9 + mov x9, x4 + umull x4, w14, w3 + add x4, x4, w10, uxtw + lsr x10, x10, #32 + lsr x14, x14, #32 + umaddl x14, w14, w3, x10 + mov x10, x4 + lsr x0, x14, #31 + mov x5, #0x13 + umaddl x5, w5, w0, x5 + add x7, x7, x5 + adds x7, x7, x11, lsl #32 + extr x3, x12, x11, #32 + adcs x8, x8, x3 + extr x3, x13, x12, #32 + adcs x9, x9, x3 + extr x3, x14, x13, #32 + lsl x5, x0, #63 + eor x10, x10, x5 + adc x10, x10, x3 + mov x3, #0x13 + tst x10, #0x8000000000000000 + csel x3, x3, xzr, pl + subs x7, x7, x3 + sbcs x8, x8, xzr + sbcs x9, x9, xzr + sbc x10, x10, xzr + and x10, x10, #0x7fffffffffffffff + stp x7, x8, [zn] + stp x9, x10, [zn+16] + +// Now copy bytewise to the output - mul_p25519(zn,xn,xm) + ldr x17, [res] ldp x10, x11, [zn] - strb w10, [resx] + strb w10, [x17] lsr x10, x10, #8 - strb w10, [resx+1] + strb w10, [x17, #1] lsr x10, x10, #8 - strb w10, [resx+2] + strb w10, [x17, #2] lsr x10, x10, #8 - strb w10, [resx+3] + strb w10, [x17, #3] lsr x10, x10, #8 - strb w10, [resx+4] + strb w10, [x17, #4] lsr x10, x10, #8 - strb w10, [resx+5] + strb w10, [x17, #5] lsr x10, x10, #8 - strb w10, [resx+6] + strb w10, [x17, #6] lsr x10, x10, #8 - strb w10, [resx+7] + strb w10, [x17, #7] - strb w11, [resx+8] + strb w11, [x17, #8] lsr x11, x11, #8 - strb w11, [resx+9] + strb w11, [x17, #9] lsr x11, x11, #8 - strb w11, [resx+10] + strb w11, [x17, #10] lsr x11, x11, #8 - strb w11, [resx+11] + strb w11, [x17, #11] lsr x11, x11, #8 - strb w11, [resx+12] + strb w11, [x17, #12] lsr x11, x11, #8 - strb w11, [resx+13] + strb w11, [x17, #13] lsr x11, x11, #8 - strb w11, [resx+14] + strb w11, [x17, #14] lsr x11, x11, #8 - strb w11, [resx+15] + strb w11, [x17, #15] ldp x12, x13, [zn+16] - strb w12, [resx+16] + strb w12, [x17, #16] lsr x12, x12, #8 - strb w12, [resx+17] + strb w12, [x17, #17] lsr x12, x12, #8 - strb w12, [resx+18] + strb w12, [x17, #18] lsr x12, x12, #8 - strb w12, [resx+19] + strb w12, [x17, #19] lsr x12, x12, #8 - strb w12, [resx+20] + strb w12, [x17, #20] lsr x12, x12, #8 - strb w12, [resx+21] + strb w12, [x17, #21] lsr x12, x12, #8 - strb w12, [resx+22] + strb w12, [x17, #22] lsr x12, x12, #8 - strb w12, [resx+23] + strb w12, [x17, #23] - strb w13, [resx+24] + strb w13, [x17, #24] lsr x13, x13, #8 - strb w13, [resx+25] + strb w13, [x17, #25] lsr x13, x13, #8 - strb w13, [resx+26] + strb w13, [x17, #26] lsr x13, x13, #8 - strb w13, [resx+27] + strb w13, [x17, #27] lsr x13, x13, #8 - strb w13, [resx+28] + strb w13, [x17, #28] lsr x13, x13, #8 - strb w13, [resx+29] + strb w13, [x17, #29] lsr x13, x13, #8 - strb w13, [resx+30] + strb w13, [x17, #30] lsr x13, x13, #8 - strb w13, [resx+31] - -// Restore stack and registers - - add sp, sp, #NSPACE - ldp x23, x24, [sp], 16 - ldp x21, x22, [sp], 16 - ldp x19, x20, [sp], 16 - + strb w13, [x17, #31] + +// Restore stack and registers (this will zero the tops of Q8...Q15). 
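Before the epilogue lines that follow, a compact model of the reduction strategy used by the inlined multiplication above (an editor's sketch, not the limb-level schedule; mul_p25519_model is a hypothetical name, and the operands are assumed to fit in 256 bits, as the repacked xn and zn do): fold the 512-bit product once with 2^256 = 38 (mod p_25519), fold the overflow again with 2^255 = 19, and finish with at most one conditional subtraction, matching the code's use of the constants 0x26 and 0x13.

    import random

    P = 2**255 - 19

    def mul_p25519_model(a, b):
        prod = a * b                        # 512-bit double-length product
        lo, hi = prod % 2**256, prod >> 256
        t = lo + 38 * hi                    # 2^256 = 2*19 = 38 (mod p); t < 39*2^256
        t = (t % 2**255) + 19 * (t >> 255)  # 2^255 = 19 (mod p); t < 2^255 + 19*128
        return t - P if t >= P else t       # one conditional subtraction suffices

    a = random.randrange(2**256)
    b = random.randrange(2**256)
    assert mul_p25519_model(a, b) == (a * b) % P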
+ + ldp d8, d9, [regsave+0] + ldp d10, d11, [regsave+16] + ldp d12, d13, [regsave+32] + ldp d14, d15, [regsave+48] + ldp x19, x20, [regsave+64] + ldp x21, x22, [regsave+80] + ldp x23, x24, [regsave+96] + ldp x25, x26, [regsave+112] + ldp x27, x28, [regsave+128] + ldp x29, x30, [regsave+144] + add sp, sp, #NSPACE+160 ret #if defined(__linux__) && defined(__ELF__) From 28a5cf4aca774ca9705449afd744e3a7c7cfcb1a Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 14 Feb 2024 08:41:59 -0800 Subject: [PATCH 6/9] Tweak attribution of SLOTHY work s2n-bignum original commit: https://github.com/awslabs/s2n-bignum/commit/f82da8fd8015d2a0b590360edb9afb3c842cfea6 --- arm/curve25519/curve25519_x25519.S | 8 ++++---- arm/curve25519/curve25519_x25519_byte.S | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/arm/curve25519/curve25519_x25519.S b/arm/curve25519/curve25519_x25519.S index 28dd2f696a..eeefa69b0c 100644 --- a/arm/curve25519/curve25519_x25519.S +++ b/arm/curve25519/curve25519_x25519.S @@ -7,7 +7,8 @@ // https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf // https://github.com/Emill/X25519-AArch64 // -// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// and the SLOTHY-based re-engineering of that code by Abdulrahman, Becker, +// Kannwischer and Klein: // // https://eprint.iacr.org/2022/1303.pdf // https://github.com/slothy-optimizer/slothy/tree/main/paper @@ -202,9 +203,8 @@ curve25519_x25519_scalarloop: // In particular, the basic dataflow and the organization between integer // and SIMD units is identical, with only a few minor changes to some // individual instructions (for miscellaneous reasons). The scheduling -// was redone from scratch by SLOTHY starting from Hanno Becker's -// un-interleaved form and using the same scripts as in Becker et al's -// paper. +// was redone from scratch by SLOTHY starting from the un-interleaved +// form in the SLOTHY work cited above, and using the same scripts. // // The intermediate value annotations were added to provide data that // is used in the formal proof, indicating which lines assign specific diff --git a/arm/curve25519/curve25519_x25519_byte.S b/arm/curve25519/curve25519_x25519_byte.S index e6c891284d..89f2f44f4e 100644 --- a/arm/curve25519/curve25519_x25519_byte.S +++ b/arm/curve25519/curve25519_x25519_byte.S @@ -7,7 +7,8 @@ // https://github.com/Emill/X25519-AArch64/blob/master/X25519_AArch64.pdf // https://github.com/Emill/X25519-AArch64 // -// and the SLOTHY-based re-engineering of that code by Hanno Becker: +// and the SLOTHY-based re-engineering of that code by Abdulrahman, Becker, +// Kannwischer and Klein: // // https://eprint.iacr.org/2022/1303.pdf // https://github.com/slothy-optimizer/slothy/tree/main/paper @@ -320,9 +321,8 @@ curve25519_x25519_byte_scalarloop: // In particular, the basic dataflow and the organization between integer // and SIMD units is identical, with only a few minor changes to some // individual instructions (for miscellaneous reasons). The scheduling -// was redone from scratch by SLOTHY starting from Hanno Becker's -// un-interleaved form and using the same scripts as in Becker et al's -// paper. +// was redone from scratch by SLOTHY starting from the un-interleaved +// form in the SLOTHY work cited above, and using the same scripts. 
// // The intermediate value annotations were added to provide data that // is used in the formal proof, indicating which lines assign specific From 7ebcca7e335132e201ffddfd5a087ceade8e2411 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Tue, 5 Mar 2024 08:00:06 +0000 Subject: [PATCH 7/9] constant-fold (1<<26)-1 --- third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S | 2 +- third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S index eeefa69b0c..de102f4f1d 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S @@ -166,7 +166,7 @@ S2N_BN_SYMBOL(curve25519_x25519): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #(1<<26)-1 + mov x0, #67108863 mov v30.d[0], x0 mov v30.d[1], x0 diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S index 89f2f44f4e..600c62f7f2 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S @@ -284,7 +284,7 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #(1<<26)-1 + mov x0, #67108863 mov v30.d[0], x0 mov v30.d[1], x0 From af960beb3bd02f4466bd8bcc174a2bc8326d02e7 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Tue, 5 Mar 2024 14:30:50 +0000 Subject: [PATCH 8/9] constant-fold 0xfe-0xb4 --- third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S | 4 ++-- .../s2n-bignum/arm/curve25519/curve25519_x25519_byte.S | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S index de102f4f1d..c7d4caa066 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519.S @@ -166,12 +166,12 @@ S2N_BN_SYMBOL(curve25519_x25519): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #67108863 + mov x0, #67108863 // #(1<<26)-1 mov v30.d[0], x0 mov v30.d[1], x0 mov x0, #0x07fffffe07fffffe - sub x1, x0, #0xfe-0xb4 + sub x1, x0, #74 // #0xfe-0xb4 sub x0, x0, #2 stp x0, x1, [mask1] diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S index 600c62f7f2..0be4bd007e 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S @@ -284,12 +284,12 @@ S2N_BN_SYMBOL(curve25519_x25519_byte): mov v31.d[0], x0 mov v31.d[1], xzr - mov x0, #67108863 + mov x0, #67108863 // #(1<<26)-1 mov v30.d[0], x0 mov v30.d[1], x0 mov x0, #0x07fffffe07fffffe - sub x1, x0, #0xfe-0xb4 + sub x1, x0, #74 // #0xfe-0xb4 sub x0, x0, #2 stp x0, x1, [mask1] From 8aa626a8b18dc0f6937879148390a6130ec5b2b8 Mon Sep 17 00:00:00 2001 From: Juneyoung Lee Date: Tue, 5 Mar 2024 16:23:39 +0000 Subject: [PATCH 9/9] Make vector elements (2S/4S/2D/16B) lower-case letters, add fcsel to delocator --- .../arm/curve25519/curve25519_x25519_byte.S | 924 +++++++++--------- util/fipstools/delocate/delocate.go | 2 +- 2 files changed, 463 insertions(+), 463 deletions(-) diff --git a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S 
b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S index 0be4bd007e..73c27db9f8 100644 --- a/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S +++ b/third_party/s2n-bignum/arm/curve25519/curve25519_x25519_byte.S @@ -359,35 +359,35 @@ curve25519_x25519_byte_scalarloop: // (x2',z2') = (x4,z4) // (x3',z3') = (x5,z5) - add v22.2S, v2.2S, v3.2S // ubignum_of_qreglist 1 // INTERMEDIATE a - sub v21.2S, v28.2S, v1.2S - add v25.2S, v0.2S, v1.2S // ubignum_of_qreglist 0 // INTERMEDIATE a - sub v24.2S, v29.2S, v3.2S - add v3.2S, v18.2S, v19.2S // ubignum_of_qreglist 4 // INTERMEDIATE c - add v0.2S, v0.2S, v21.2S // ubignum_of_qreglist 0 // INTERMEDIATE b - sub v20.2S, v29.2S, v15.2S - sub v1.2S, v29.2S, v5.2S - sub v26.2S, v28.2S, v11.2S - sub v21.2S, v29.2S, v19.2S - add v19.2S, v10.2S, v11.2S // ubignum_of_qreglist 0 // INTERMEDIATE c - add v11.2S, v14.2S, v20.2S // ubignum_of_qreglist 2 // INTERMEDIATE d - add v21.2S, v18.2S, v21.2S // ubignum_of_qreglist 4 // INTERMEDIATE d - sub v20.2S, v29.2S, v17.2S - add v18.2S, v2.2S, v24.2S // ubignum_of_qreglist 1 // INTERMEDIATE b - add v14.2S, v14.2S, v15.2S // ubignum_of_qreglist 2 // INTERMEDIATE c - add v15.2S, v16.2S, v17.2S // ubignum_of_qreglist 3 // INTERMEDIATE c - add v2.2S, v16.2S, v20.2S // ubignum_of_qreglist 3 // INTERMEDIATE d - add v24.2S, v12.2S, v13.2S // ubignum_of_qreglist 1 // INTERMEDIATE c - add v26.2S, v10.2S, v26.2S // ubignum_of_qreglist 0 // INTERMEDIATE d - sub v10.2S, v29.2S, v13.2S - sub v13.2S, v29.2S, v7.2S - add v23.2S, v6.2S, v7.2S // ubignum_of_qreglist 3 // INTERMEDIATE a - sub v7.2S, v29.2S, v9.2S - add v27.2S, v12.2S, v10.2S // ubignum_of_qreglist 1 // INTERMEDIATE d + add v22.2s, v2.2s, v3.2s // ubignum_of_qreglist 1 // INTERMEDIATE a + sub v21.2s, v28.2s, v1.2s + add v25.2s, v0.2s, v1.2s // ubignum_of_qreglist 0 // INTERMEDIATE a + sub v24.2s, v29.2s, v3.2s + add v3.2s, v18.2s, v19.2s // ubignum_of_qreglist 4 // INTERMEDIATE c + add v0.2s, v0.2s, v21.2s // ubignum_of_qreglist 0 // INTERMEDIATE b + sub v20.2s, v29.2s, v15.2s + sub v1.2s, v29.2s, v5.2s + sub v26.2s, v28.2s, v11.2s + sub v21.2s, v29.2s, v19.2s + add v19.2s, v10.2s, v11.2s // ubignum_of_qreglist 0 // INTERMEDIATE c + add v11.2s, v14.2s, v20.2s // ubignum_of_qreglist 2 // INTERMEDIATE d + add v21.2s, v18.2s, v21.2s // ubignum_of_qreglist 4 // INTERMEDIATE d + sub v20.2s, v29.2s, v17.2s + add v18.2s, v2.2s, v24.2s // ubignum_of_qreglist 1 // INTERMEDIATE b + add v14.2s, v14.2s, v15.2s // ubignum_of_qreglist 2 // INTERMEDIATE c + add v15.2s, v16.2s, v17.2s // ubignum_of_qreglist 3 // INTERMEDIATE c + add v2.2s, v16.2s, v20.2s // ubignum_of_qreglist 3 // INTERMEDIATE d + add v24.2s, v12.2s, v13.2s // ubignum_of_qreglist 1 // INTERMEDIATE c + add v26.2s, v10.2s, v26.2s // ubignum_of_qreglist 0 // INTERMEDIATE d + sub v10.2s, v29.2s, v13.2s + sub v13.2s, v29.2s, v7.2s + add v23.2s, v6.2s, v7.2s // ubignum_of_qreglist 3 // INTERMEDIATE a + sub v7.2s, v29.2s, v9.2s + add v27.2s, v12.2s, v10.2s // ubignum_of_qreglist 1 // INTERMEDIATE d fcsel d20, d22, d24, eq // ubignum_of_qreglist 1 // INTERMEDIATE f - add v28.2S, v4.2S, v5.2S // ubignum_of_qreglist 2 // INTERMEDIATE a + add v28.2s, v4.2s, v5.2s // ubignum_of_qreglist 2 // INTERMEDIATE a fcsel d12, d23, d15, eq // ubignum_of_qreglist 3 // INTERMEDIATE f - add v7.2S, v8.2S, v7.2S // ubignum_of_qreglist 4 // INTERMEDIATE b + add v7.2s, v8.2s, v7.2s // ubignum_of_qreglist 4 // INTERMEDIATE b fcsel d16, d25, d19, eq // ubignum_of_qreglist 0 // INTERMEDIATE f mov x0, v20.d[0] fcsel d5, 
d28, d14, eq // ubignum_of_qreglist 2 // INTERMEDIATE f @@ -397,589 +397,589 @@ curve25519_x25519_byte_scalarloop: lsr x26, x0, #32 add x29, x21, x21 umull x15, w5, w29 - add v13.2S, v6.2S, v13.2S // ubignum_of_qreglist 3 // INTERMEDIATE b + add v13.2s, v6.2s, v13.2s // ubignum_of_qreglist 3 // INTERMEDIATE b add x12, x26, x26 mov x30, v5.d[0] fcsel d10, d18, d27, eq // ubignum_of_qreglist 1 // INTERMEDIATE g lsr x11, x5, #32 lsr x10, x30, #32 - trn2 v20.2S, v21.2S, v3.2S - add v9.2S, v8.2S, v9.2S // ubignum_of_qreglist 4 // INTERMEDIATE a + trn2 v20.2s, v21.2s, v3.2s + add v9.2s, v8.2s, v9.2s // ubignum_of_qreglist 4 // INTERMEDIATE a add x14, x11, x11 - trn2 v6.2S, v2.2S, v15.2S - trn1 v12.2S, v25.2S, v0.2S - add v1.2S, v4.2S, v1.2S // ubignum_of_qreglist 2 // INTERMEDIATE b - trn1 v16.2S, v23.2S, v13.2S + trn2 v6.2s, v2.2s, v15.2s + trn1 v12.2s, v25.2s, v0.2s + add v1.2s, v4.2s, v1.2s // ubignum_of_qreglist 2 // INTERMEDIATE b + trn1 v16.2s, v23.2s, v13.2s fcsel d8, d13, d2, eq // ubignum_of_qreglist 3 // INTERMEDIATE g - trn2 v17.2S, v27.2S, v24.2S + trn2 v17.2s, v27.2s, v24.2s str d29, [tmpb+32] add x17, x10, x10 - trn2 v4.2S, v28.2S, v1.2S - trn1 v5.2S, v28.2S, v1.2S - trn1 v28.2S, v2.2S, v15.2S - trn1 v2.2S, v22.2S, v18.2S + trn2 v4.2s, v28.2s, v1.2s + trn1 v5.2s, v28.2s, v1.2s + trn1 v28.2s, v2.2s, v15.2s + trn1 v2.2s, v22.2s, v18.2s fcsel d29, d0, d26, eq // ubignum_of_qreglist 0 // INTERMEDIATE g - trn2 v15.2S, v22.2S, v18.2S - umull v22.2D, v12.2S, v20.2S + trn2 v15.2s, v22.2s, v18.2s + umull v22.2d, v12.2s, v20.2s umull x22, w30, w17 stp d29, d10, [tmpb+0] - trn2 v10.2S, v23.2S, v13.2S - trn2 v23.2S, v11.2S, v14.2S - trn1 v13.2S, v27.2S, v24.2S + trn2 v10.2s, v23.2s, v13.2s + trn2 v23.2s, v11.2s, v14.2s + trn1 v13.2s, v27.2s, v24.2s fcsel d27, d1, d11, eq // ubignum_of_qreglist 2 // INTERMEDIATE g - trn1 v14.2S, v11.2S, v14.2S - umlal v22.2D, v2.2S, v6.2S + trn1 v14.2s, v11.2s, v14.2s + umlal v22.2d, v2.2s, v6.2s umull x25, w30, w30 - umlal v22.2D, v5.2S, v23.2S + umlal v22.2d, v5.2s, v23.2s add x3, x30, x30 - umlal v22.2D, v16.2S, v17.2S + umlal v22.2d, v16.2s, v17.2s add w30, w21, w21, lsl #1; stp d27, d8, [tmpb+16] add w30, w30, w21, lsl #4 - trn1 v11.2S, v26.2S, v19.2S - trn2 v8.2S, v26.2S, v19.2S - trn2 v19.2S, v25.2S, v0.2S - mul v29.2S, v20.2S, v31.2S + trn1 v11.2s, v26.2s, v19.2s + trn2 v8.2s, v26.2s, v19.2s + trn2 v19.2s, v25.2s, v0.2s + mul v29.2s, v20.2s, v31.2s ldr x20, [tmpb+24] - umull v25.2D, v19.2S, v6.2S + umull v25.2d, v19.2s, v6.2s add x1, x0, x0 - umull v27.2D, v19.2S, v23.2S + umull v27.2d, v19.2s, v23.2s umull x9, w5, w1 - umull v0.2D, v12.2S, v23.2S + umull v0.2d, v12.2s, v23.2s lsr x24, x20, #32 - mul v20.2S, v23.2S, v31.2S + mul v20.2s, v23.2s, v31.2s lsr x16, x21, #32 - umlal v25.2D, v15.2S, v23.2S + umlal v25.2d, v15.2s, v23.2s umaddl x13, w11, w14, x9 - umlal v25.2D, v4.2S, v17.2S + umlal v25.2d, v4.2s, v17.2s umaddl x9, w14, w17, x15 - umull v24.2D, v12.2S, v6.2S + umull v24.2d, v12.2s, v6.2s add w2, w16, w16, lsl #1; fcsel d26, d9, d3, eq // ubignum_of_qreglist 4 // INTERMEDIATE f add w2, w2, w16, lsl #4 - trn1 v18.2S, v21.2S, v3.2S - umull v3.2D, v19.2S, v29.2S + trn1 v18.2s, v21.2s, v3.2s + umull v3.2d, v19.2s, v29.2s umull x28, w5, w3 - mul v1.2S, v6.2S, v31.2S + mul v1.2s, v6.2s, v31.2s umull x8, w5, w5 - umlal v24.2D, v2.2S, v23.2S + umlal v24.2d, v2.2s, v23.2s umaddl x13, w21, w30, x13 - mul v23.2S, v17.2S, v31.2S + mul v23.2s, v17.2s, v31.2s umaddl x27, w14, w12, x28 - trn2 v6.2S, v9.2S, v7.2S + trn2 v6.2s, v9.2s, v7.2s mov x6, v26.d[0] - 
umlal v3.2D, v15.2S, v1.2S + umlal v3.2d, v15.2s, v1.2s add x16, x16, x16 - umlal v3.2D, v4.2S, v20.2S + umlal v3.2d, v4.2s, v20.2s lsr x4, x6, #32 - umlal v3.2D, v10.2S, v23.2S + umlal v3.2d, v10.2s, v23.2s add x7, x6, x6 - umull v26.2D, v19.2S, v8.2S + umull v26.2d, v19.2s, v8.2s add x23, x4, x4 umaddl x28, w5, w23, x22 - trn1 v7.2S, v9.2S, v7.2S - umlal v27.2D, v15.2S, v17.2S + trn1 v7.2s, v9.2s, v7.2s + umlal v27.2d, v15.2s, v17.2s add w15, w4, w4, lsl #1; - umlal v27.2D, v4.2S, v8.2S + umlal v27.2d, v4.2s, v8.2s add w15, w15, w4, lsl #4 add w22, w10, w10, lsl #1; - umlal v24.2D, v5.2S, v17.2S + umlal v24.2d, v5.2s, v17.2s add w22, w22, w10, lsl #4 umaddl x10, w11, w7, x28 - umlal v25.2D, v10.2S, v8.2S + umlal v25.2d, v10.2s, v8.2s umull x21, w5, w16 - umlal v25.2D, v6.2S, v29.2S + umlal v25.2d, v6.2s, v29.2s umaddl x23, w15, w23, x25 - umlal v27.2D, v10.2S, v29.2S + umlal v27.2d, v10.2s, v29.2s umull x19, w5, w12 - umlal v27.2D, v6.2S, v1.2S + umlal v27.2d, v6.2s, v1.2s umaddl x25, w11, w29, x21 - umlal v0.2D, v2.2S, v17.2S + umlal v0.2d, v2.2s, v17.2s umaddl x28, w0, w3, x9 - shl v21.2D, v25.2D, #1 + shl v21.2d, v25.2d, #1 umaddl x4, w11, w1, x19 umaddl x21, w2, w29, x4 - mul v25.2S, v8.2S, v31.2S - umlal v24.2D, v16.2S, v8.2S + mul v25.2s, v8.2s, v31.2s + umlal v24.2d, v16.2s, v8.2s umaddl x19, w0, w17, x25 - umlal v24.2D, v7.2S, v29.2S + umlal v24.2d, v7.2s, v29.2s umull x25, w5, w17 - umlal v24.2D, v19.2S, v28.2S + umlal v24.2d, v19.2s, v28.2s umaddl x4, w0, w16, x10 - umull v9.2D, v12.2S, v8.2S + umull v9.2d, v12.2s, v8.2s umaddl x23, w5, w7, x23 - umlal v21.2D, v12.2S, v18.2S + umlal v21.2d, v12.2s, v18.2s add w10, w6, w6, lsl #1; - shl v27.2D, v27.2D, #1 + shl v27.2d, v27.2d, #1 add w10, w10, w6, lsl #4 umaddl x28, w26, w12, x28 - umlal v26.2D, v15.2S, v29.2S + umlal v26.2d, v15.2s, v29.2s umaddl x9, w14, w16, x23 - umlal v9.2D, v2.2S, v29.2S + umlal v9.2d, v2.2s, v29.2s umaddl x22, w22, w17, x8 - umlal v21.2D, v2.2S, v28.2S + umlal v21.2d, v2.2s, v28.2s umaddl x28, w6, w10, x28 umaddl x27, w0, w0, x27 add x8, x14, x14 - umlal v0.2D, v5.2S, v8.2S + umlal v0.2d, v5.2s, v8.2s umull x5, w5, w14 - umlal v9.2D, v5.2S, v1.2S + umlal v9.2d, v5.2s, v1.2s umaddl x14, w0, w29, x9 - umlal v26.2D, v4.2S, v1.2S + umlal v26.2d, v4.2s, v1.2s umaddl x6, w2, w16, x27 - umlal v22.2D, v7.2S, v8.2S + umlal v22.2d, v7.2s, v8.2s umaddl x5, w30, w17, x5 umaddl x5, w2, w3, x5 add x23, x17, x17 - umlal v27.2D, v12.2S, v28.2S + umlal v27.2d, v12.2s, v28.2s umaddl x13, w2, w23, x13 - umlal v26.2D, v10.2S, v20.2S + umlal v26.2d, v10.2s, v20.2s add x9, x12, x12 - umlal v9.2D, v16.2S, v20.2S + umlal v9.2d, v16.2s, v20.2s umaddl x27, w10, w29, x6 - umlal v0.2D, v16.2S, v29.2S + umlal v0.2d, v16.2s, v29.2s umaddl x6, w11, w3, x25 - umlal v22.2D, v19.2S, v18.2S + umlal v22.2d, v19.2s, v18.2s umaddl x19, w26, w3, x19 - mul v18.2S, v18.2S, v31.2S + mul v18.2s, v18.2s, v31.2s umaddl x23, w15, w23, x27 - umlal v3.2D, v6.2S, v25.2S + umlal v3.2d, v6.2s, v25.2s umaddl x0, w0, w12, x6 - umlal v0.2D, v7.2S, v1.2S + umlal v0.2d, v7.2s, v1.2s add x11, x16, x16 - umlal v9.2D, v7.2S, v23.2S + umlal v9.2d, v7.2s, v23.2s umaddl x6, w12, w17, x14 - umlal v9.2D, v19.2S, v11.2S + umlal v9.2d, v19.2s, v11.2s umaddl x25, w26, w29, x4 - umlal v9.2D, v15.2S, v18.2S + umlal v9.2d, v15.2s, v18.2s umaddl x14, w10, w3, x13 - umull v25.2D, v12.2S, v17.2S + umull v25.2d, v12.2s, v17.2s umaddl x27, w10, w16, x0 - umlal v26.2D, v6.2S, v23.2S + umlal v26.2d, v6.2s, v23.2s add x0, x25, x6, lsr #26 - mul v23.2S, v28.2S, v31.2S + mul v23.2s, 
v28.2s, v31.2s umaddl x12, w10, w12, x5 - shl v3.2D, v3.2D, #1 + shl v3.2d, v3.2d, #1 add x16, x22, x0, lsr #25 - umlal v21.2D, v5.2S, v14.2S + umlal v21.2d, v5.2s, v14.2s bic x22, x0, #0x1ffffff - umlal v3.2D, v12.2S, v11.2S + umlal v3.2d, v12.2s, v11.2s add x26, x16, x22, lsr #24 - umlal v3.2D, v2.2S, v18.2S + umlal v3.2d, v2.2s, v18.2s umaddl x16, w10, w17, x21 - umlal v3.2D, v5.2S, v23.2S + umlal v3.2d, v5.2s, v23.2s add x22, x26, x22, lsr #21 - umlal v9.2D, v4.2S, v23.2S + umlal v9.2d, v4.2s, v23.2s umaddl x5, w15, w29, x27 - umull v17.2D, v19.2S, v17.2S + umull v17.2d, v19.2s, v17.2s umaddl x17, w30, w3, x22 - umlal v25.2D, v2.2S, v8.2S + umlal v25.2d, v2.2s, v8.2s umaddl x25, w15, w3, x16 - umlal v25.2D, v5.2S, v29.2S + umlal v25.2d, v5.2s, v29.2s umaddl x26, w15, w7, x19 - umlal v0.2D, v19.2S, v14.2S + umlal v0.2d, v19.2s, v14.2s umaddl x17, w2, w9, x17 - umlal v17.2D, v15.2S, v8.2S + umlal v17.2d, v15.2s, v8.2s ldr x19, [tmpb+0] - umlal v17.2D, v4.2S, v29.2S + umlal v17.2d, v4.2s, v29.2s ldr x7, [tmpb+8] - shl v29.2D, v26.2D, #1 + shl v29.2d, v26.2d, #1 umaddl x13, w10, w1, x17 - umlal v0.2D, v15.2S, v13.2S + umlal v0.2d, v15.2s, v13.2s lsr x2, x19, #32 - umlal v29.2D, v12.2S, v13.2S + umlal v29.2d, v12.2s, v13.2s umaddl x27, w15, w1, x12 - umlal v29.2D, v2.2S, v11.2S + umlal v29.2d, v2.2s, v11.2s umaddl x30, w15, w8, x13 - umlal v29.2D, v5.2S, v18.2S + umlal v29.2d, v5.2s, v18.2s add x4, x7, x7 - umlal v29.2D, v16.2S, v23.2S + umlal v29.2d, v16.2s, v23.2s umaddl x29, w15, w9, x14 - umlal v0.2D, v4.2S, v11.2S + umlal v0.2d, v4.2s, v11.2s add x17, x27, x30, lsr #26 - umlal v0.2D, v10.2S, v18.2S + umlal v0.2d, v10.2s, v18.2s umaddl x16, w15, w11, x28 - umlal v0.2D, v6.2S, v23.2S + umlal v0.2d, v6.2s, v23.2s add x1, x29, x17, lsr #25 - umlal v25.2D, v16.2S, v1.2S + umlal v25.2d, v16.2s, v1.2s umull x11, w19, w4 ldr x8, [tmpb+32] - mul v26.2S, v14.2S, v31.2S - umlal v17.2D, v10.2S, v1.2S + mul v26.2s, v14.2s, v31.2s + umlal v17.2d, v10.2s, v1.2s ldr x15, [tmpb+16] - umlal v17.2D, v6.2S, v20.2S + umlal v17.2d, v6.2s, v20.2s and x9, x30, #0x3ffffff bfi x9, x17, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE aa add x17, x2, x2 lsr x10, x15, #32 add x27, x25, x1, lsr #26 - umlal v25.2D, v7.2S, v20.2S + umlal v25.2d, v7.2s, v20.2s add x13, x10, x10 - umlal v25.2D, v19.2S, v13.2S + umlal v25.2d, v19.2s, v13.2s add x29, x23, x27, lsr #25 - umlal v25.2D, v15.2S, v11.2S + umlal v25.2d, v15.2s, v11.2s lsr x30, x8, #32 - umlal v25.2D, v4.2S, v18.2S + umlal v25.2d, v4.2s, v18.2s add x23, x5, x29, lsr #26 - umlal v25.2D, v10.2S, v23.2S + umlal v25.2d, v10.2s, v23.2s and x14, x29, #0x3ffffff - umlal v25.2D, v6.2S, v26.2S + umlal v25.2d, v6.2s, v26.2s add x5, x16, x23, lsr #25 - shl v8.2D, v17.2D, #1 + shl v8.2d, v17.2d, #1 umaddl x12, w2, w17, x11 and x29, x5, #0x3ffffff umull x21, w19, w19 - umlal v29.2D, v7.2S, v26.2S + umlal v29.2d, v7.2s, v26.2s add w16, w10, w10, lsl #1; - umlal v3.2D, v16.2S, v26.2S + umlal v3.2d, v16.2s, v26.2s add w16, w16, w10, lsl #4 bfi x14, x23, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE aa add w10, w24, w24, lsl #1; add x22, x26, x5, lsr #26 add w10, w10, w24, lsl #4 - umlal v8.2D, v12.2S, v14.2S + umlal v8.2d, v12.2s, v14.2s umaddl x25, w16, w13, x21 - umlal v8.2D, v2.2S, v13.2S + umlal v8.2d, v2.2s, v13.2s bfi x29, x22, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE aa - umlal v8.2D, v5.2S, v11.2S + umlal v8.2d, v5.2s, v11.2s add x26, x24, x24 - umlal v8.2D, v16.2S, v18.2S + umlal v8.2d, v16.2s, v18.2s stp x14, x29, [tmpa+16] - umlal v8.2D, v7.2S, v23.2S 
+ umlal v8.2d, v7.2s, v23.2s add w24, w30, w30, lsl #1; - usra v25.2D, v29.2D, #26 + usra v25.2d, v29.2d, #26 add w24, w24, w30, lsl #4 umull x29, w15, w15 - umlal v27.2D, v2.2S, v14.2S + umlal v27.2d, v2.2s, v14.2s umull x3, w15, w13 - umlal v27.2D, v5.2S, v13.2S + umlal v27.2d, v5.2s, v13.2s add x21, x20, x20 - umlal v24.2D, v15.2S, v14.2S + umlal v24.2d, v15.2s, v14.2s umull x5, w19, w21 - umlal v24.2D, v4.2S, v13.2S + umlal v24.2d, v4.2s, v13.2s and x11, x1, #0x3ffffff - usra v8.2D, v25.2D, #25 + usra v8.2d, v25.2d, #25 and x1, x0, #0x1ffffff - umlal v27.2D, v16.2S, v11.2S + umlal v27.2d, v16.2s, v11.2s umaddl x23, w17, w13, x5 - umlal v27.2D, v7.2S, v18.2S + umlal v27.2d, v7.2s, v18.2s add x5, x30, x30 - usra v0.2D, v8.2D, #26 + usra v0.2d, v8.2d, #26 add x0, x15, x15 - umlal v24.2D, v10.2S, v11.2S + umlal v24.2d, v10.2s, v11.2s umaddl x23, w7, w0, x23 - umlal v24.2D, v6.2S, v18.2S + umlal v24.2d, v6.2s, v18.2s lsr x30, x7, #32 - usra v27.2D, v0.2D, #25 + usra v27.2d, v0.2d, #25 add x16, x30, x30 - and v20.16B, v8.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad + and v20.16b, v8.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = bc|ad umaddl x15, w30, w16, x23 - ushr v23.2D, v30.2D, #1 + ushr v23.2d, v30.2d, #1 add w23, w8, w8, lsl #1; - usra v24.2D, v27.2D, #26 + usra v24.2d, v27.2d, #26 add w23, w23, w8, lsl #4 umaddl x14, w19, w5, x3 - and v8.16B, v27.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad + and v8.16b, v27.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = bc|ad add x28, x8, x8 - and v27.16B, v0.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad + and v27.16b, v0.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = bc|ad umaddl x8, w8, w23, x15 - and v5.16B, v24.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad + and v5.16b, v24.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = bc|ad umaddl x3, w2, w28, x14 - umlal v22.2D, v15.2S, v28.2S + umlal v22.2d, v15.2s, v28.2s bfi x11, x27, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE aa - uzp1 v5.4S, v8.4S, v5.4S + uzp1 v5.4s, v8.4s, v5.4s umaddl x14, w24, w5, x29 umaddl x5, w19, w28, x14 ldr d18, [mask1] mov v18.d[1], v18.d[0] umaddl x15, w7, w26, x3 - mul v12.2S, v13.2S, v31.2S - umlal v21.2D, v16.2S, v13.2S + mul v12.2s, v13.2s, v31.2s + umlal v21.2d, v16.2s, v13.2s stp x9, x11, [tmpa+0] - umlal v21.2D, v7.2S, v11.2S + umlal v21.2d, v7.2s, v11.2s umaddl x29, w17, w26, x5 - umlal v22.2D, v4.2S, v14.2S + umlal v22.2d, v4.2s, v14.2s add w14, w20, w20, lsl #1; - umlal v22.2D, v10.2S, v13.2S + umlal v22.2d, v10.2s, v13.2s add w14, w14, w20, lsl #4 umull x3, w19, w0 - umlal v22.2D, v6.2S, v11.2S + umlal v22.2d, v6.2s, v11.2s umaddl x29, w7, w21, x29 - usra v21.2D, v24.2D, #25 + usra v21.2d, v24.2d, #25 umaddl x11, w20, w14, x12 - and v0.16B, v25.16B, v23.16B + and v0.16b, v25.16b, v23.16b umaddl x5, w30, w21, x15 - and v14.16B, v29.16B, v30.16B + and v14.16b, v29.16b, v30.16b umaddl x12, w16, w13, x29 - usra v22.2D, v21.2D, #26 + usra v22.2d, v21.2d, #26 umaddl x29, w17, w16, x3 - umlal v3.2D, v7.2S, v12.2S + umlal v3.2d, v7.2s, v12.2s add x9, x26, x26 - and v1.16B, v21.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = bc|ad + and v1.16b, v21.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // 
INTERMEDIATE H|L = bc|ad add x27, x5, x12, lsr #26 - bic v8.16B, v22.16B, v23.16B + bic v8.16b, v22.16b, v23.16b umaddl x29, w7, w7, x29 - and v17.16B, v22.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad + and v17.16b, v22.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = bc|ad add x5, x25, x27, lsr #25 - usra v3.2D, v8.2D, #25 + usra v3.2d, v8.2d, #25 umaddl x25, w24, w9, x8 - umlal v9.2D, v10.2S, v26.2S + umlal v9.2d, v10.2s, v26.2s add x8, x13, x13 - trn1 v22.4S, v1.4S, v17.4S + trn1 v22.4s, v1.4s, v17.4s umaddl x11, w10, w8, x11 - usra v3.2D, v8.2D, #24 + usra v3.2d, v8.2d, #24 umull x20, w19, w16 - add v26.2S, v22.2S, v18.2S + add v26.2s, v22.2s, v18.2s ldr d28, [mask2] - umlal v9.2D, v6.2S, v12.2S + umlal v9.2d, v6.2s, v12.2s umaddl x3, w23, w0, x11 - usra v3.2D, v8.2D, #21 + usra v3.2d, v8.2d, #21 umaddl x29, w10, w26, x29 - uzp1 v11.4S, v20.4S, v27.4S + uzp1 v11.4s, v20.4s, v27.4s umaddl x20, w2, w4, x20 umaddl x9, w10, w21, x20 mov v17.d[0], v22.d[1] - usra v9.2D, v3.2D, #26 + usra v9.2d, v3.2d, #26 umull x15, w19, w13 - and v7.16B, v3.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad + and v7.16b, v3.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = bc|ad add x11, x16, x16 - uzp2 v1.4S, v11.4S, v5.4S + uzp2 v1.4s, v11.4s, v5.4s umaddl x20, w23, w13, x9 - and v8.16B, v9.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad + and v8.16b, v9.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = bc|ad umaddl x9, w2, w0, x15 - usra v14.2D, v9.2D, #25 + usra v14.2d, v9.2d, #25 and x6, x6, #0x3ffffff - uzp1 v7.4S, v7.4S, v8.4S + uzp1 v7.4s, v7.4s, v8.4s umaddl x29, w23, w21, x29 - uzp1 v27.4S, v11.4S, v5.4S + uzp1 v27.4s, v11.4s, v5.4s umull x15, w19, w26 - usra v0.2D, v14.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad + usra v0.2d, v14.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = bc|ad add x6, x6, x22, lsr #25 - and v3.16B, v14.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad + and v3.16b, v14.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = bc|ad bic x22, x27, #0x1ffffff - sub v2.2S, v26.2S, v17.2S - add v9.2S, v22.2S, v17.2S - uzp1 v14.4S, v3.4S, v0.4S + sub v2.2s, v26.2s, v17.2s + add v9.2s, v22.2s, v17.2s + uzp1 v14.4s, v3.4s, v0.4s umaddl x2, w2, w21, x15 - add v5.4S, v27.4S, v18.4S + add v5.4s, v27.4s, v18.4s add x5, x5, x22, lsr #24 - zip1 v22.2S, v2.2S, v9.2S // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 + zip1 v22.2s, v2.2s, v9.2s // ubignum_of_h32reglist 8 + ubignum_of_l32reglist 8 // INTERMEDIATE H|L = t1|t2 mov v18.b[0], v28.b[0] - uzp1 v8.4S, v7.4S, v14.4S + uzp1 v8.4s, v7.4s, v14.4s add x22, x5, x22, lsr #21 - uzp2 v3.4S, v7.4S, v14.4S + uzp2 v3.4s, v7.4s, v14.4s umaddl x5, w7, w16, x9 - add v25.4S, v8.4S, v18.4S + add v25.4s, v8.4s, v18.4s umaddl x15, w14, w0, x22 - add v12.4S, v27.4S, v1.4S + add v12.4s, v27.4s, v1.4s add x9, x17, x17 - sub v14.4S, v5.4S, v1.4S + sub v14.4s, v5.4s, v1.4s umull x19, w19, w17 - sub v18.4S, v25.4S, v3.4S + sub v18.4s, v25.4s, v3.4s ldr x22, [tmpa+8] - add v20.4S, v8.4S, v3.4S + add v20.4s, v8.4s, v3.4s umaddl x15, w10, w11, x15 - zip1 v16.4S, v14.4S, v12.4S // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 + zip1 
v16.4s, v14.4s, v12.4s // ubignum_of_h32reglist 4 + ubignum_of_l32reglist 4 // INTERMEDIATE H|L = t1|t2 umaddl x14, w14, w13, x19 - zip2 v14.4S, v14.4S, v12.4S // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 + zip2 v14.4s, v14.4s, v12.4s // ubignum_of_h32reglist 6 + ubignum_of_l32reglist 6 // INTERMEDIATE H|L = t1|t2 and x17, x27, #0x1ffffff - zip2 v0.4S, v18.4S, v20.4S // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 + zip2 v0.4s, v18.4s, v20.4s // ubignum_of_h32reglist 2 + ubignum_of_l32reglist 2 // INTERMEDIATE H|L = t1|t2 umaddl x15, w23, w4, x15 - zip1 v1.4S, v18.4S, v20.4S // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 + zip1 v1.4s, v18.4s, v20.4s // ubignum_of_h32reglist 0 + ubignum_of_l32reglist 0 // INTERMEDIATE H|L = t1|t2 umaddl x10, w10, w0, x14 - zip2 v5.2S, v2.2S, v9.2S // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 - shl v24.2S, v0.2S, #1 + zip2 v5.2s, v2.2s, v9.2s // ubignum_of_h32reglist 9 + ubignum_of_l32reglist 9 // INTERMEDIATE H|L = t1|t2 + shl v24.2s, v0.2s, #1 mov v19.d[0], v1.d[1] // ubignum_of_h32reglist 1 + ubignum_of_l32reglist 1 // INTERMEDIATE H|L = t1|t2 - shl v26.2S, v22.2S, #1 - shl v17.2S, v16.2S, #1 + shl v26.2s, v22.2s, #1 + shl v17.2s, v16.2s, #1 mov v15.d[0], v0.d[1] // ubignum_of_h32reglist 3 + ubignum_of_l32reglist 3 // INTERMEDIATE H|L = t1|t2 - shl v7.2S, v5.2S, #1 - shl v18.2S, v19.2S, #1 - umull v11.2D, v1.2S, v24.2S + shl v7.2s, v5.2s, #1 + shl v18.2s, v19.2s, #1 + umull v11.2d, v1.2s, v24.2s umaddl x19, w23, w16, x10 - umull v6.2D, v1.2S, v17.2S + umull v6.2d, v1.2s, v17.2s umaddl x10, w7, w13, x2 mov v4.d[0], v16.d[1] // ubignum_of_h32reglist 5 + ubignum_of_l32reglist 5 // INTERMEDIATE H|L = t1|t2 mov v10.d[0], v14.d[1] // ubignum_of_h32reglist 7 + ubignum_of_l32reglist 7 // INTERMEDIATE H|L = t1|t2 - umull v9.2D, v1.2S, v26.2S + umull v9.2d, v1.2s, v26.2s ldr x13, [tmpa+0] - shl v28.2S, v15.2S, #1 - shl v3.2S, v10.2S, #1 + shl v28.2s, v15.2s, #1 + shl v3.2s, v10.2s, #1 ldr x14, [tmpa+16] - mul v12.2S, v10.2S, v31.2S - umull v25.2D, v1.2S, v7.2S + mul v12.2s, v10.2s, v31.2s + umull v25.2d, v1.2s, v7.2s ldr x2, [tmpa+24] - umlal v6.2D, v18.2S, v28.2S + umlal v6.2d, v18.2s, v28.2s umaddl x27, w30, w0, x10 umaddl x16, w24, w0, x20 - shl v13.2S, v14.2S, #1 + shl v13.2s, v14.2s, #1 umaddl x5, w23, w26, x5 - mul v2.2S, v22.2S, v31.2S - umull v21.2D, v1.2S, v13.2S + mul v2.2s, v22.2s, v31.2s + umull v21.2d, v1.2s, v13.2s umaddl x23, w24, w8, x29 - umlal v11.2D, v18.2S, v19.2S + umlal v11.2d, v18.2s, v19.2s mov x10, #0x07fffffe07fffffe sub x10, x10, #2 umaddl x26, w24, w21, x5 - mul v29.2S, v14.2S, v31.2S - umlal v25.2D, v19.2S, v26.2S + mul v29.2s, v14.2s, v31.2s + umlal v25.2d, v19.2s, v26.2s add x7, x1, x6, lsr #26 - mul v20.2S, v4.2S, v31.2S + mul v20.2s, v4.2s, v31.2s and x6, x6, #0x3ffffff - shl v8.2S, v18.2S, #1 - shl v4.2S, v4.2S, #1 - umlal v11.2D, v29.2S, v14.2S + shl v8.2s, v18.2s, #1 + shl v4.2s, v4.2s, #1 + umlal v11.2d, v29.2s, v14.2s bfi x6, x7, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE aa - umlal v25.2D, v0.2S, v3.2S + umlal v25.2d, v0.2s, v3.2s umaddl x0, w24, w4, x19 - umlal v25.2D, v15.2S, v13.2S + umlal v25.2d, v15.2s, v13.2s str x6, [tmpa+32] - umlal v21.2D, v18.2S, v4.2S + umlal v21.2d, v18.2s, v4.2s umaddl x8, w24, w11, x3 - umlal v21.2D, v0.2S, v17.2S + umlal v21.2d, v0.2s, v17.2s ldr x30, [tmpa+32] - mul v14.2S, v5.2S, v31.2S + mul v14.2s, v5.2s, v31.2s add x2, x2, x10 - shl v5.2S, v28.2S, 
#1 - shl v27.2S, v4.2S, #1 - umlal v6.2D, v0.2S, v0.2S + shl v5.2s, v28.2s, #1 + shl v27.2s, v4.2s, #1 + umlal v6.2d, v0.2s, v0.2s umaddl x11, w24, w9, x15 - umlal v6.2D, v12.2S, v3.2S + umlal v6.2d, v12.2s, v3.2s add x4, x30, x10 - umlal v11.2D, v14.2S, v5.2S + umlal v11.2d, v14.2s, v5.2s add x3, x22, x10 - umlal v11.2D, v2.2S, v17.2S + umlal v11.2d, v2.2s, v17.2s add x6, x0, x11, lsr #26 - umlal v11.2D, v12.2S, v27.2S + umlal v11.2d, v12.2s, v27.2s add x14, x14, x10 - umlal v6.2D, v14.2S, v27.2S + umlal v6.2d, v14.2s, v27.2s add x8, x8, x6, lsr #25 - umlal v6.2D, v2.2S, v13.2S + umlal v6.2d, v2.2s, v13.2s movk x10, #0xffb4 - umlal v25.2D, v16.2S, v4.2S + umlal v25.2d, v16.2s, v4.2s add x29, x16, x8, lsr #26 - umull v27.2D, v1.2S, v3.2S + umull v27.2d, v1.2s, v3.2s and x11, x11, #0x3ffffff - umlal v9.2D, v18.2S, v3.2S + umlal v9.2d, v18.2s, v3.2s add x19, x13, x10 - umlal v9.2D, v0.2S, v13.2S + umlal v9.2d, v0.2s, v13.2s and x5, x8, #0x3ffffff - umlal v9.2D, v28.2S, v4.2S + umlal v9.2d, v28.2s, v4.2s bfi x11, x6, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE bb - umlal v9.2D, v16.2S, v16.2S + umlal v9.2d, v16.2s, v16.2s umaddl x30, w24, w28, x27 - umlal v9.2D, v14.2S, v7.2S + umlal v9.2d, v14.2s, v7.2s sub x13, x19, x11 - umull v10.2D, v1.2S, v18.2S + umull v10.2d, v1.2s, v18.2s add x7, x23, x29, lsr #25 - umlal v21.2D, v28.2S, v15.2S + umlal v21.2d, v28.2s, v15.2s lsr x16, x13, #32 // ubignum_of_wreglist 1 + ubignum_of_wreglist 0 // INTERMEDIATE e - umlal v21.2D, v2.2S, v22.2S + umlal v21.2d, v2.2s, v22.2s add x0, x26, x7, lsr #26 - usra v25.2D, v9.2D, #26 + usra v25.2d, v9.2d, #26 and x20, x7, #0x3ffffff - umull v22.2D, v1.2S, v1.2S + umull v22.2d, v1.2s, v1.2s add x8, x25, x0, lsr #25 - umull v7.2D, v1.2S, v28.2S + umull v7.2d, v1.2s, v28.2s and x1, x29, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bbalt - bic v18.16B, v25.16B, v23.16B + bic v18.16b, v25.16b, v23.16b and x19, x8, #0x3ffffff - and v16.16B, v9.16B, v30.16B + and v16.16b, v9.16b, v30.16b and x7, x12, #0x3ffffff - usra v22.2D, v18.2D, #25 + usra v22.2d, v18.2d, #25 add x10, x30, x8, lsr #26 - umlal v7.2D, v19.2S, v24.2S + umlal v7.2d, v19.2s, v24.2s bfi x5, x29, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE bb - and v9.16B, v25.16B, v23.16B + and v9.16b, v25.16b, v23.16b add x27, x7, x10, lsr #25 - usra v22.2D, v18.2D, #24 + usra v22.2d, v18.2d, #24 mov x21, #60833 lsl x21, x21, #1 add x15, x17, x27, lsr #26 - shl v25.2S, v3.2S, #1 - umlal v7.2D, v14.2S, v17.2S + shl v25.2s, v3.2s, #1 + umlal v7.2d, v14.2s, v17.2s and x29, x27, #0x3ffffff - usra v22.2D, v18.2D, #21 + usra v22.2d, v18.2d, #21 bfi x29, x15, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE bb // ***SOURCE*** ubignum_of_xreglist 9 // INTERMEDIATE bbalt - umlal v10.2D, v14.2S, v24.2S + umlal v10.2d, v14.2s, v24.2s and x17, x6, #0x1ffffff // ubignum_of_xreglist 1 // INTERMEDIATE bbalt - umlal v10.2D, v2.2S, v28.2S + umlal v10.2d, v2.2s, v28.2s sub x6, x3, x5 - umlal v10.2D, v12.2S, v17.2S + umlal v10.2d, v12.2s, v17.2s umaddl x25, w16, w21, x17 - umlal v10.2D, v29.2S, v4.2S + umlal v10.2d, v29.2s, v4.2s mov w12, w5 // ubignum_of_xreglist 2 // INTERMEDIATE bbalt - umlal v22.2D, v20.2S, v4.2S + umlal v22.2d, v20.2s, v4.2s lsr x26, x6, #32 // ubignum_of_wreglist 3 + ubignum_of_wreglist 2 // INTERMEDIATE e - umlal v22.2D, v14.2S, v8.2S + umlal v22.2d, v14.2s, v8.2s and x24, x0, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bbalt - umlal v22.2D, v2.2S, v24.2S + umlal v22.2d, v2.2s, v24.2s stp x11, x5, [tmpb+0] - umlal v22.2D, v12.2S, v5.2S + 
umlal v22.2d, v12.2s, v5.2s bfi x20, x0, #32, #25 // ubignum_of_preglist 2 // INTERMEDIATE bb - umlal v22.2D, v29.2S, v17.2S + umlal v22.2d, v29.2s, v17.2s umaddl x12, w6, w21, x12 - umull v18.2D, v1.2S, v4.2S + umull v18.2d, v1.2s, v4.2s bfi x19, x10, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE bb - umlal v7.2D, v2.2S, v4.2S + umlal v7.2d, v2.2s, v4.2s sub x7, x14, x20 - umlal v27.2D, v19.2S, v13.2S + umlal v27.2d, v19.2s, v13.2s mov w8, w20 // ubignum_of_xreglist 4 // INTERMEDIATE bbalt - usra v10.2D, v22.2D, #26 + usra v10.2d, v22.2d, #26 lsr x14, x7, #32 // ubignum_of_wreglist 5 + ubignum_of_wreglist 4 // INTERMEDIATE e - umlal v18.2D, v19.2S, v17.2S + umlal v18.2d, v19.2s, v17.2s and x28, x10, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bbalt - umlal v7.2D, v12.2S, v13.2S + umlal v7.2d, v12.2s, v13.2s sub x5, x2, x19 - usra v11.2D, v10.2D, #25 + usra v11.2d, v10.2d, #25 mov w2, w19 // ubignum_of_xreglist 6 // INTERMEDIATE bbalt - umlal v27.2D, v0.2S, v4.2S - umlal v21.2D, v14.2S, v25.2S + umlal v27.2d, v0.2s, v4.2s + umlal v21.2d, v14.2s, v25.2s sub x23, x4, x29 - usra v7.2D, v11.2D, #26 + usra v7.2d, v11.2d, #26 mov w0, w29 // ubignum_of_xreglist 8 // INTERMEDIATE bbalt - umlal v18.2D, v0.2S, v28.2S + umlal v18.2d, v0.2s, v28.2s lsr x22, x23, #32 // ubignum_of_wreglist 9 + ubignum_of_wreglist 8 // INTERMEDIATE e - umlal v27.2D, v15.2S, v17.2S + umlal v27.2d, v15.2s, v17.2s str x29, [tmpb+32] - usra v6.2D, v7.2D, #25 + usra v6.2d, v7.2d, #25 mov w17, w11 // ubignum_of_xreglist 0 // INTERMEDIATE bbalt - and v0.16B, v22.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 + and v0.16b, v22.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x5|t3 umaddl x27, w26, w21, x1 - umlal v18.2D, v14.2S, v13.2S + umlal v18.2d, v14.2s, v13.2s umaddl x30, w23, w21, x0 - umlal v18.2D, v2.2S, v3.2S + umlal v18.2d, v2.2s, v3.2s lsr x10, x5, #32 // ubignum_of_wreglist 7 + ubignum_of_wreglist 6 // INTERMEDIATE e - and v4.16B, v6.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 - and v1.16B, v10.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 + and v4.16b, v6.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x5|t3 + and v1.16b, v10.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x5|t3 umaddl x4, w14, w21, x24 ldr x0, [tmpa+0] mov v0.s[1], w0 lsr x0, x0, #32 mov v1.s[1], w0 umaddl x9, w7, w21, x8 - usra v18.2D, v6.2D, #26 + usra v18.2d, v6.2d, #26 umaddl x24, w10, w21, x28 - and v3.16B, v7.16B, v23.16B // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 + and v3.16b, v7.16b, v23.16b // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x5|t3 umaddl x8, w22, w21, x15 - umlal v27.2D, v14.2S, v26.2S + umlal v27.2d, v14.2s, v26.2s umaddl x15, w13, w21, x17 - usra v21.2D, v18.2D, #25 + usra v21.2d, v18.2d, #25 stp x20, x19, [tmpb+16] - and v2.16B, v11.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 + and v2.16b, v11.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x5|t3 lsr x29, x8, #25 ldr x3, [tmpb+0] mov v10.s[1], w3 lsr x3, x3, #32 mov v11.s[1], w3 add x17, x15, x29 - usra v27.2D, v21.2D, #26 + usra v27.2d, v21.2d, #26 add x28, x17, x29, lsl #1 - and v6.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 + and 
v6.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x5|t3 and x20, x8, #0x1ffffff - and v5.16B, v18.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 + and v5.16b, v18.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x5|t3 add x17, x28, x29, lsl #4 - and v7.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 + and v7.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x5|t3 ldr x3, [tmpb+8] mov v22.s[1], w3 lsr x3, x3, #32 @@ -990,7 +990,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v11.s[0], w15 and x11, x17, #0x3ffffff // ubignum_of_xreglist 0 // INTERMEDIATE bce - usra v16.2D, v27.2D, #25 + usra v16.2d, v27.2d, #25 add x8, x12, x29, lsr #25 ldr x3, [tmpb+16] mov v14.s[1], w3 @@ -1002,7 +1002,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v23.s[0], w15 add x28, x27, x8, lsr #26 - and v8.16B, v16.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + and v8.16b, v16.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 umull x1, w12, w10 ldr x3, [tmpb+24] mov v17.s[1], w3 @@ -1014,7 +1014,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v15.s[0], w15 umaddl x19, w5, w21, x2 - usra v9.2D, v16.2D, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 + usra v9.2d, v16.2d, #26 // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x5|t3 add x2, x4, x25, lsr #26 ldr x3, [tmpb+32] mov v24.s[1], w3 @@ -1026,7 +1026,7 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v18.s[0], w15 add x29, x19, x2, lsr #25 - umull v26.2D, v0.2S, v23.2S + umull v26.2d, v0.2s, v23.2s and x21, x28, #0x1ffffff // ubignum_of_xreglist 3 // INTERMEDIATE bce ldr x0, [tmpa+8] mov v2.s[1], w0 @@ -1038,20 +1038,20 @@ curve25519_x25519_byte_scalarloop: lsr x15, x15, #32 mov v25.s[0], w15 add x17, x24, x29, lsr #26 - umull v29.2D, v1.2S, v18.2S + umull v29.2d, v1.2s, v18.2s and x15, x8, #0x3ffffff // ubignum_of_xreglist 2 // INTERMEDIATE bce - umull v20.2D, v0.2S, v15.2S + umull v20.2d, v0.2s, v15.2s add x19, x30, x17, lsr #25 and x3, x17, #0x1ffffff // ubignum_of_xreglist 7 // INTERMEDIATE bce - mul v12.2S, v25.2S, v31.2S + mul v12.2s, v25.2s, v31.2s ldr x0, [tmpa+16] mov v4.s[1], w0 lsr x0, x0, #32 mov v5.s[1], w0 add x4, x20, x19, lsr #26 // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v26.2D, v2.2S, v11.2S + umlal v26.2d, v2.2s, v11.2s add w28, w3, w3, lsl #1; - umlal v20.2D, v2.2S, v23.2S + umlal v20.2d, v2.2s, v23.2s add w28, w28, w3, lsl #4 umull x8, w12, w5 ldr x0, [tmpa+24] @@ -1059,12 +1059,12 @@ curve25519_x25519_byte_scalarloop: lsr x0, x0, #32 mov v7.s[1], w0 and x30, x25, #0x3ffffff // ubignum_of_xreglist 4 // INTERMEDIATE bce - mul v16.2S, v18.2S, v31.2S + mul v16.2s, v18.2s, v31.2s add w17, w4, w4, lsl #1; - umull v21.2D, v1.2S, v15.2S + umull v21.2d, v1.2s, v15.2s add w17, w17, w4, lsl #4 umaddl x25, w21, w7, x8 - umlal v20.2D, v4.2S, v11.2S + umlal v20.2d, v4.2s, v11.2s add w8, w21, w21, lsl #1; ldr x0, [tmpa+32] add w8, w8, w21, lsl #4 @@ -1072,300 +1072,300 @@ curve25519_x25519_byte_scalarloop: lsr x0, x0, #32 mov v9.s[1], w0 and x2, x2, #0x1ffffff // ubignum_of_xreglist 5 // INTERMEDIATE bce - umlal v29.2D, v3.2S, v15.2S + umlal v29.2d, v3.2s, v15.2s umaddl x24, w2, w6, x25 - umull v13.2D, v0.2S, v25.2S + umull v13.2d, v0.2s, 
v25.2s umaddl x25, w2, w7, x27 umaddl x0, w3, w6, x25 - mul v19.2S, v15.2S, v31.2S - umull v27.2D, v0.2S, v18.2S + mul v19.2s, v15.2s, v31.2s + umull v27.2d, v0.2s, v18.2s umaddl x20, w3, w13, x24 - umlal v20.2D, v6.2S, v12.2S + umlal v20.2d, v6.2s, v12.2s umaddl x24, w21, w14, x1 - umlal v13.2D, v2.2S, v18.2S + umlal v13.2d, v2.2s, v18.2s umaddl x9, w4, w13, x0 - umull v25.2D, v0.2S, v11.2S + umull v25.2d, v0.2s, v11.2s umaddl x20, w17, w23, x20 - umlal v27.2D, v2.2S, v15.2S + umlal v27.2d, v2.2s, v15.2s umaddl x0, w2, w26, x24 - umull v28.2D, v1.2S, v11.2S + umull v28.2d, v1.2s, v11.2s umull x24, w17, w5 - umlal v29.2D, v5.2S, v23.2S + umlal v29.2d, v5.2s, v23.2s umaddl x9, w11, w22, x9 - umlal v13.2D, v4.2S, v15.2S + umlal v13.2d, v4.2s, v15.2s umaddl x27, w3, w16, x0 - umlal v27.2D, v4.2S, v23.2S + umlal v27.2d, v4.2s, v23.2s umull x0, w17, w14 - umlal v27.2D, v6.2S, v11.2S + umlal v27.2d, v6.2s, v11.2s umull x4, w12, w14 - umlal v27.2D, v8.2S, v12.2S + umlal v27.2d, v8.2s, v12.2s umaddl x25, w11, w10, x20 - umlal v27.2D, v1.2S, v17.2S + umlal v27.2d, v1.2s, v17.2s umaddl x0, w28, w10, x0 - umlal v13.2D, v6.2S, v23.2S + umlal v13.2d, v6.2s, v23.2s umull x3, w17, w6 - umlal v13.2D, v8.2S, v11.2S + umlal v13.2d, v8.2s, v11.2s umaddl x1, w21, w26, x4 - umlal v20.2D, v8.2S, v16.2S + umlal v20.2d, v8.2s, v16.2s umaddl x4, w2, w13, x24 - umlal v28.2D, v3.2S, v12.2S + umlal v28.2d, v3.2s, v12.2s umaddl x20, w28, w7, x3 - umlal v29.2D, v7.2S, v11.2S + umlal v29.2d, v7.2s, v11.2s and x3, x19, #0x3ffffff // ubignum_of_xreglist 9 // INTERMEDIATE bce - umlal v29.2D, v9.2S, v12.2S + umlal v29.2d, v9.2s, v12.2s umaddl x19, w17, w22, x27 add w27, w2, w2, lsl #1; - mul v18.2S, v24.2S, v31.2S + mul v18.2s, v24.2s, v31.2s add w27, w27, w2, lsl #4 - umlal v21.2D, v3.2S, v23.2S + umlal v21.2d, v3.2s, v23.2s umull x24, w17, w7 - umlal v13.2D, v1.2S, v24.2S + umlal v13.2d, v1.2s, v24.2s add x19, x19, x19 - shl v29.2D, v29.2D, #1 + shl v29.2d, v29.2d, #1 umaddl x1, w2, w16, x1 - umull v15.2D, v1.2S, v23.2S + umull v15.2d, v1.2s, v23.2s umaddl x0, w27, w22, x0 - umlal v29.2D, v0.2S, v24.2S + umlal v29.2d, v0.2s, v24.2s umaddl x2, w28, w5, x24 - mul v24.2S, v23.2S, v31.2S + mul v24.2s, v23.2s, v31.2s umaddl x4, w28, w23, x4 - umlal v21.2D, v5.2S, v11.2S + umlal v21.2d, v5.2s, v11.2s umaddl x24, w27, w5, x20 - umlal v20.2D, v1.2S, v14.2S + umlal v20.2d, v1.2s, v14.2s umaddl x20, w11, w23, x19 - umlal v26.2D, v4.2S, v12.2S + umlal v26.2d, v4.2s, v12.2s umaddl x19, w27, w23, x2 - umlal v26.2D, v6.2S, v16.2S + umlal v26.2d, v6.2s, v16.2s umaddl x2, w21, w6, x4 - umlal v29.2D, v2.2S, v17.2S + umlal v29.2d, v2.2s, v17.2s umaddl x24, w8, w23, x24 - umlal v15.2D, v3.2S, v11.2S + umlal v15.2d, v3.2s, v11.2s umaddl x0, w21, w16, x0 umaddl x4, w21, w13, x19 - mul v23.2S, v11.2S, v31.2S - umlal v20.2D, v3.2S, v22.2S + mul v23.2s, v11.2s, v31.2s + umlal v20.2d, v3.2s, v22.2s umaddl x2, w12, w7, x2 - umlal v20.2D, v5.2S, v10.2S + umlal v20.2d, v5.2s, v10.2s umaddl x19, w12, w26, x0 - umlal v29.2D, v4.2S, v14.2S + umlal v29.2d, v4.2s, v14.2s umaddl x0, w12, w13, x24 - umlal v26.2D, v8.2S, v19.2S + umlal v26.2d, v8.2s, v19.2s umaddl x20, w15, w5, x20 - umlal v26.2D, v1.2S, v22.2S + umlal v26.2d, v1.2s, v22.2s umaddl x21, w15, w10, x9 - umlal v26.2D, v3.2S, v10.2S + umlal v26.2d, v3.2s, v10.2s and x9, x29, #0x3ffffff // ubignum_of_xreglist 6 // INTERMEDIATE bce - umlal v29.2D, v6.2S, v22.2S + umlal v29.2d, v6.2s, v22.2s umaddl x20, w30, w7, x20 umaddl x1, w28, w22, x1 add x24, x19, x19 - umull v11.2D, v1.2S, v12.2S + umull 
v11.2d, v1.2s, v12.2s
 add w19, w3, w3, lsl #1;
- umlal v26.2D, v5.2S, v18.2S
+ umlal v26.2d, v5.2s, v18.2s
 add w19, w19, w3, lsl #4
 umaddl x20, w9, w6, x20
- umlal v29.2D, v8.2S, v10.2S
+ umlal v29.2d, v8.2s, v10.2s
 add w29, w9, w9, lsl #1;
- umlal v13.2D, v3.2S, v17.2S
+ umlal v13.2d, v3.2s, v17.2s
 add w29, w29, w9, lsl #4
 umaddl x2, w19, w10, x2
- umlal v11.2D, v3.2S, v16.2S
+ umlal v11.2d, v3.2s, v16.2s
 umaddl x21, w30, w14, x21
- umlal v11.2D, v5.2S, v19.2S
+ umlal v11.2d, v5.2s, v19.2s
 umaddl x20, w3, w13, x20
- umlal v11.2D, v7.2S, v24.2S
+ umlal v11.2d, v7.2s, v24.2s
 umaddl x2, w29, w22, x2
- umlal v11.2D, v9.2S, v23.2S
+ umlal v11.2d, v9.2s, v23.2s
 umaddl x21, w9, w26, x21
- ushr v23.2D, v30.2D, #1
+ ushr v23.2d, v30.2d, #1
 umaddl x1, w17, w10, x1
- umlal v13.2D, v5.2S, v14.2S
+ umlal v13.2d, v5.2s, v14.2s
 umaddl x24, w19, w5, x24
- umlal v27.2D, v3.2S, v14.2S
+ umlal v27.2d, v3.2s, v14.2s
 umaddl x21, w3, w16, x21
- shl v11.2D, v11.2D, #1
+ shl v11.2d, v11.2d, #1
 add w3, w30, w30, lsl #1;
- umlal v28.2D, v5.2S, v16.2S
+ umlal v28.2d, v5.2s, v16.2s
 add w3, w3, w30, lsl #4
 umaddl x24, w29, w23, x24
- umlal v28.2D, v7.2S, v19.2S
+ umlal v28.2d, v7.2s, v19.2s
 add x1, x1, x1
- umlal v28.2D, v9.2S, v24.2S
+ umlal v28.2d, v9.2s, v24.2s
 umaddl x1, w11, w5, x1
- umlal v15.2D, v5.2S, v12.2S
+ umlal v15.2d, v5.2s, v12.2s
 umaddl x24, w30, w13, x24
- umlal v15.2D, v7.2S, v16.2S
+ umlal v15.2d, v7.2s, v16.2s
 umaddl x25, w15, w14, x25
- umlal v15.2D, v9.2S, v19.2S
+ umlal v15.2d, v9.2s, v19.2s
 umaddl x1, w15, w7, x1
- shl v28.2D, v28.2D, #1
+ shl v28.2d, v28.2d, #1
 umaddl x24, w15, w6, x24
- umlal v21.2D, v7.2S, v12.2S
+ umlal v21.2d, v7.2s, v12.2s
 umaddl x2, w30, w16, x2
- umlal v21.2D, v9.2S, v16.2S
+ umlal v21.2d, v9.2s, v16.2s
 umaddl x25, w30, w26, x25
- shl v15.2D, v15.2D, #1
+ shl v15.2d, v15.2d, #1
 umaddl x30, w30, w6, x1
- umlal v28.2D, v0.2S, v22.2S
+ umlal v28.2d, v0.2s, v22.2s
 umaddl x1, w15, w26, x2
- umlal v28.2D, v2.2S, v10.2S
+ umlal v28.2d, v2.2s, v10.2s
 umaddl x2, w9, w16, x25
- shl v21.2D, v21.2D, #1
+ shl v21.2d, v21.2d, #1
 umaddl x24, w11, w7, x24
- umlal v15.2D, v0.2S, v14.2S
+ umlal v15.2d, v0.2s, v14.2s
 umaddl x1, w11, w14, x1
- umlal v21.2D, v0.2S, v17.2S
+ umlal v21.2d, v0.2s, v17.2s
 umaddl x25, w9, w13, x30
- umlal v28.2D, v4.2S, v18.2S
+ umlal v28.2d, v4.2s, v18.2s
 umaddl x0, w19, w26, x0
- umlal v25.2D, v2.2S, v12.2S
+ umlal v25.2d, v2.2s, v12.2s
 add x1, x1, x24, lsr #26
- umlal v25.2D, v4.2S, v16.2S
+ umlal v25.2d, v4.2s, v16.2s
 umaddl x30, w19, w22, x2
- umlal v21.2D, v2.2S, v14.2S
+ umlal v21.2d, v2.2s, v14.2s
 umaddl x4, w12, w6, x4
- mul v14.2S, v14.2S, v31.2S
+ mul v14.2s, v14.2s, v31.2s
 umaddl x25, w19, w23, x25
 and x2, x1, #0x1ffffff
- mul v16.2S, v17.2S, v31.2S
- umlal v25.2D, v6.2S, v19.2S
+ mul v16.2s, v17.2s, v31.2s
+ umlal v25.2d, v6.2s, v19.2s
 umaddl x9, w19, w14, x4
- umlal v13.2D, v7.2S, v22.2S
+ umlal v13.2d, v7.2s, v22.2s
 add x25, x25, x1, lsr #25
- umlal v21.2D, v4.2S, v22.2S
+ umlal v21.2d, v4.2s, v22.2s
 umaddl x0, w29, w14, x0
- umlal v26.2D, v7.2S, v16.2S
+ umlal v26.2d, v7.2s, v16.2s
 add x30, x30, x25, lsr #26
- umlal v26.2D, v9.2S, v14.2S
+ umlal v26.2d, v9.2s, v14.2s
 add w1, w15, w15, lsl #1;
- umlal v28.2D, v6.2S, v16.2S
+ umlal v28.2d, v6.2s, v16.2s
 add w1, w1, w15, lsl #4
 add x4, x20, x30, lsr #25
- umlal v28.2D, v8.2S, v14.2S
+ umlal v28.2d, v8.2s, v14.2s
 and x25, x25, #0x3ffffff
- umlal v15.2D, v2.2S, v22.2S
+ umlal v15.2d, v2.2s, v22.2s
 add x21, x21, x4, lsr #26
- umlal v11.2D, v0.2S, v10.2S
+ umlal v11.2d, v0.2s, v10.2s
 bfi x25, x30, #32, #25 // ubignum_of_preglist 3 // INTERMEDIATE z4
- umlal v11.2D, v2.2S, v18.2S
+ umlal v11.2d, v2.2s, v18.2s
 bic x30, x21, #0x3ffffff
- usra v26.2D, v28.2D, #26
+ usra v26.2d, v28.2d, #26
 lsr x20, x30, #26
- umlal v15.2D, v4.2S, v10.2S
+ umlal v15.2d, v4.2s, v10.2s
 add x20, x20, x30, lsr #25
- umlal v15.2D, v6.2S, v18.2S
+ umlal v15.2d, v6.2s, v18.2s
 umaddl x9, w29, w10, x9
- umlal v15.2D, v8.2S, v16.2S
+ umlal v15.2d, v8.2s, v16.2s
 add x30, x20, x30, lsr #22
- umlal v27.2D, v5.2S, v22.2S
+ umlal v27.2d, v5.2s, v22.2s
 umull x20, w17, w26
- umlal v20.2D, v7.2S, v18.2S
+ umlal v20.2d, v7.2s, v18.2s
 umaddl x30, w17, w16, x30
- umlal v20.2D, v9.2S, v16.2S
+ umlal v20.2d, v9.2s, v16.2s
 umaddl x17, w3, w10, x0
- usra v15.2D, v26.2D, #25
+ usra v15.2d, v26.2d, #25
 umaddl x0, w28, w14, x20
- umlal v27.2D, v7.2S, v10.2S
+ umlal v27.2d, v7.2s, v10.2s
 umaddl x20, w28, w26, x30
- umlal v27.2D, v9.2S, v18.2S
+ umlal v27.2d, v9.2s, v18.2s
 add w28, w12, w12, lsl #1;
- usra v20.2D, v15.2D, #26
+ usra v20.2d, v15.2d, #26
 add w28, w28, w12, lsl #4
 umaddl x30, w27, w10, x0
- and v17.16B, v15.16B, v30.16B // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5
+ and v17.16b, v15.16b, v30.16b // ubignum_of_hreglist 4 + ubignum_of_lreglist 4 // INTERMEDIATE H|L = x4|z5
 umaddl x27, w27, w14, x20
 umaddl x0, w8, w10, x27
- mul v12.2S, v22.2S, v31.2S
- and v15.16B, v20.16B, v23.16B // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5
+ mul v12.2s, v22.2s, v31.2s
+ and v15.16b, v20.16b, v23.16b // ubignum_of_hreglist 5 + ubignum_of_lreglist 5 // INTERMEDIATE H|L = x4|z5
 umaddl x14, w3, w22, x9
- umlal v21.2D, v6.2S, v10.2S
+ umlal v21.2d, v6.2s, v10.2s
 umaddl x27, w8, w22, x30
- trn1 v15.4S, v17.4S, v15.4S // FINAL z3
+ trn1 v15.4s, v17.4s, v15.4s // FINAL z3
 umaddl x10, w28, w22, x0
- umlal v11.2D, v4.2S, v16.2S
+ umlal v11.2d, v4.2s, v16.2s
 umaddl x30, w15, w16, x14
- and v26.16B, v26.16B, v23.16B
+ and v26.16b, v26.16b, v23.16b
 umaddl x28, w12, w16, x27
- umlal v21.2D, v8.2S, v18.2S
+ umlal v21.2d, v8.2s, v18.2s
 add x10, x10, x10
- umlal v25.2D, v8.2S, v24.2S
+ umlal v25.2d, v8.2s, v24.2s
 umaddl x20, w19, w6, x10
- umlal v25.2D, v1.2S, v10.2S
+ umlal v25.2d, v1.2s, v10.2s
 add x28, x28, x28
- umlal v25.2D, v3.2S, v18.2S
+ umlal v25.2d, v3.2s, v18.2s
 umaddl x28, w19, w7, x28
- usra v21.2D, v20.2D, #25
+ usra v21.2d, v20.2d, #25
 umaddl x0, w29, w7, x20
- umlal v11.2D, v6.2S, v14.2S
+ umlal v11.2d, v6.2s, v14.2s
 umaddl x10, w11, w26, x30
- umlal v13.2D, v9.2S, v10.2S
+ umlal v13.2d, v9.2s, v10.2s
 umaddl x19, w29, w5, x28
- usra v27.2D, v21.2D, #26
+ usra v27.2d, v21.2d, #26
 umaddl x0, w3, w5, x0
- umlal v25.2D, v5.2S, v16.2S
+ umlal v25.2d, v5.2s, v16.2s
 umaddl x20, w1, w22, x17
- and v20.16B, v28.16B, v30.16B
+ and v20.16b, v28.16b, v30.16b
 umaddl x29, w3, w23, x19
- usra v29.2D, v27.2D, #25
+ usra v29.2d, v27.2d, #25
 umaddl x3, w1, w23, x0
- and v27.16B, v27.16B, v23.16B // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5
- umlal v11.2D, v8.2S, v12.2S
+ and v27.16b, v27.16b, v23.16b // ubignum_of_hreglist 7 + ubignum_of_lreglist 7 // INTERMEDIATE H|L = x4|z5
+ umlal v11.2d, v8.2s, v12.2s
 umaddl x12, w15, w13, x29
- usra v13.2D, v29.2D, #26
+ usra v13.2d, v29.2d, #26
 umaddl x7, w11, w13, x3
- trn1 v6.4S, v6.4S, v7.4S
+ trn1 v6.4s, v6.4s, v7.4s
 umaddl x17, w11, w16, x20
- umlal v25.2D, v7.2S, v14.2S
+ umlal v25.2d, v7.2s, v14.2s
 and x23, x4, #0x3ffffff
- bic v19.16B, v13.16B, v23.16B
+ bic v19.16b, v13.16b, v23.16b
 umaddl x19, w11, w6, x12
- and v28.16B, v13.16B, v23.16B // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5
+ and v28.16b, v13.16b, v23.16b // ubignum_of_hreglist 9 + ubignum_of_lreglist 9 // INTERMEDIATE H|L = x4|z5
 add x3, x17, x7, lsr #26
- usra v11.2D, v19.2D, #25
- trn1 v2.4S, v2.4S, v3.4S
+ usra v11.2d, v19.2d, #25
+ trn1 v2.4s, v2.4s, v3.4s
 add x17, x19, x3, lsr #25
- and v13.16B, v21.16B, v30.16B // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5
+ and v13.16b, v21.16b, v30.16b // ubignum_of_hreglist 6 + ubignum_of_lreglist 6 // INTERMEDIATE H|L = x4|z5
 and x5, x7, #0x3ffffff
- usra v11.2D, v19.2D, #24
+ usra v11.2d, v19.2d, #24
 add x7, x10, x17, lsr #26
- trn1 v0.4S, v0.4S, v1.4S
+ trn1 v0.4s, v0.4s, v1.4s
 and x19, x24, #0x3ffffff
- and v21.16B, v29.16B, v30.16B // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5
+ and v21.16b, v29.16b, v30.16b // ubignum_of_hreglist 8 + ubignum_of_lreglist 8 // INTERMEDIATE H|L = x4|z5
 add x29, x19, x7, lsr #25
- usra v11.2D, v19.2D, #21
+ usra v11.2d, v19.2d, #21
 bfi x5, x3, #32, #25 // ubignum_of_preglist 0 // INTERMEDIATE z4
- trn1 v17.4S, v13.4S, v27.4S // FINAL z3
+ trn1 v17.4s, v13.4s, v27.4s // FINAL z3
 add x19, x2, x29, lsr #26
- trn1 v19.4S, v21.4S, v28.4S // FINAL z3
+ trn1 v19.4s, v21.4s, v28.4s // FINAL z3
 and x3, x29, #0x3ffffff
 mov v16.d[0], v6.d[1] // FINAL x3
 mov v6.d[0], v17.d[1] // FINAL x2
- trn1 v8.4S, v8.4S, v9.4S
+ trn1 v8.4s, v8.4s, v9.4s
 bfi x3, x19, #32, #26 // ubignum_of_preglist 2 // INTERMEDIATE z4
- and v21.16B, v11.16B, v30.16B // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5
+ and v21.16b, v11.16b, v30.16b // ubignum_of_hreglist 0 + ubignum_of_lreglist 0 // INTERMEDIATE H|L = x4|z5
 bfi x23, x21, #32, #26 // ubignum_of_preglist 4 // INTERMEDIATE z4
 mov v18.d[0], v8.d[1] // FINAL x3
 mov v8.d[0], v19.d[1] // FINAL x2
- umlal v25.2D, v9.2S, v12.2S
+ umlal v25.2d, v9.2s, v12.2s
 mov v9.d[0], x23 // FINAL z2
 mov v7.d[0], x25 // FINAL z2
 ldr d29, [mask1]
 mov v12.d[0], v2.d[1] // FINAL x3
- trn1 v4.4S, v4.4S, v5.4S
+ trn1 v4.4s, v4.4s, v5.4s
 and x17, x17, #0x3ffffff
- usra v25.2D, v11.2D, #26
+ usra v25.2d, v11.2d, #26
 mov v10.d[0], v0.d[1] // FINAL x3
 mov v14.d[0], v4.d[1] // FINAL x3
 mov v4.d[0], v15.d[1] // FINAL x2
- usra v20.2D, v25.2D, #25
- and v27.16B, v25.16B, v23.16B // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5
+ usra v20.2d, v25.2d, #25
+ and v27.16b, v25.16b, v23.16b // ubignum_of_hreglist 1 + ubignum_of_lreglist 1 // INTERMEDIATE H|L = x4|z5
 bfi x17, x7, #32, #25 // ubignum_of_preglist 1 // INTERMEDIATE z4
 mov v5.d[0], x3 // depth 86
 mov v1.d[0], x5 // FINAL z2
- usra v26.2D, v20.2D, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5
- and v28.16B, v20.16B, v30.16B // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5
- trn1 v11.4S, v21.4S, v27.4S // FINAL z3
- trn1 v13.4S, v28.4S, v26.4S // FINAL z3
+ usra v26.2d, v20.2d, #26 // ubignum_of_hreglist 3 + ubignum_of_lreglist 3 // INTERMEDIATE H|L = x4|z5
+ and v28.16b, v20.16b, v30.16b // ubignum_of_hreglist 2 + ubignum_of_lreglist 2 // INTERMEDIATE H|L = x4|z5
+ trn1 v11.4s, v21.4s, v27.4s // FINAL z3
+ trn1 v13.4s, v28.4s, v26.4s // FINAL z3
 mov v0.d[0], v11.d[1] // FINAL x2
 mov v3.d[0], x17 // FINAL z2
 mov v2.d[0], v13.d[1] // FINAL x2
diff --git a/util/fipstools/delocate/delocate.go b/util/fipstools/delocate/delocate.go
index a1d9a57b26..4a55f6b3fb 100644
--- a/util/fipstools/delocate/delocate.go
+++ b/util/fipstools/delocate/delocate.go
@@ -508,7 +508,7 @@ func (d *delocation) processAarch64Instruction(statement, instruction *node32) (
 	argNodes := instructionArgs(instruction.next)
 
 	switch instructionName {
-	case "ccmn", "ccmp", "cinc", "cinv", "cneg", "csel", "cset", "csetm", "csinc", "csinv", "csneg":
+	case "ccmn", "ccmp", "cinc", "cinv", "cneg", "csel", "cset", "csetm", "csinc", "csinv", "csneg", "fcsel":
 		// These functions are special because they take a condition-code name as
 		// an argument and that looks like a symbol reference.
 		d.writeNode(statement)