From e0494aafd9853b5460d338b6500429837572c25f Mon Sep 17 00:00:00 2001
From: Joel
Date: Wed, 21 Jun 2023 10:49:12 -0400
Subject: [PATCH 1/3] build: do not default to x86_64 asm; only use it if explicitly requested

---
 configure.ac | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/configure.ac b/configure.ac
index a502d1304a..c81b0b1a46 100644
--- a/configure.ac
+++ b/configure.ac
@@ -266,10 +266,6 @@ else
 fi
 
 if test x"$req_asm" = x"auto"; then
-  SECP_X86_64_ASM_CHECK
-  if test x"$has_x86_64_asm" = x"yes"; then
-    set_asm=x86_64
-  fi
   if test x"$set_asm" = x; then
     set_asm=no
   fi

From d38ab07c01d93f9c01c84826fc322d06655a16ce Mon Sep 17 00:00:00 2001
From: Joel
Date: Fri, 7 Jul 2023 09:43:25 +0930
Subject: [PATCH 2/3] add a runtime check for BMI2 and ADX

---
 src/secp256k1.c |  5 +++++
 src/selftest.h  | 28 ++++++++++++++++++++++++++++
 2 files changed, 33 insertions(+)

diff --git a/src/secp256k1.c b/src/secp256k1.c
index 4c11e7f0b8..7faf445760 100644
--- a/src/secp256k1.c
+++ b/src/secp256k1.c
@@ -83,6 +83,11 @@ static int secp256k1_context_is_proper(const secp256k1_context* ctx) {
 }
 
 void secp256k1_selftest(void) {
+    if (!SECP256K1_CHECKMEM_RUNNING()) {
+        if (!secp256k1_selftest_cpuid()) {
+            secp256k1_callback_call(&default_error_callback, "required CPU flags are not present.");
+        }
+    }
     if (!secp256k1_selftest_passes()) {
         secp256k1_callback_call(&default_error_callback, "self test failed");
     }
diff --git a/src/selftest.h b/src/selftest.h
index d083ac9524..37a1712a21 100644
--- a/src/selftest.h
+++ b/src/selftest.h
@@ -25,6 +25,34 @@ static int secp256k1_selftest_sha256(void) {
     return secp256k1_memcmp_var(out, output32, 32) == 0;
 }
 
+static int secp256k1_selftest_cpuid(void) {
+    int ret = 1;
+
+#if defined(USE_ASM_X86_64)
+    /* Query the CPU feature flags with the CPUID instruction; see the Intel SDM,
+     * Vol. 2A, Table 3-8 "Information Returned by CPUID Instruction".
+     * Leaf 7, subleaf 0 reports the structured extended feature flags in EBX. */
+    const int BIT_ADX = 19;
+    const int BIT_BMI2 = 8;
+    int leaf = 7;
+    int subleaf = 0;
+    int flags = 0;
+    int has_adx = 0;
+    int has_bmi2 = 0;
+    /* The x86_64 assembly requires BMI2 and ADX support. CPUID overwrites EAX,
+     * EBX, ECX and EDX, so EAX/ECX are in/out operands and EDX is clobbered. */
+    __asm__ __volatile__("cpuid\n"
+                         : "+a"(leaf), "=b"(flags), "+c"(subleaf)
+                         :
+                         : "rdx", "cc");
+
+    has_adx = (flags >> BIT_ADX) & 1;
+    has_bmi2 = (flags >> BIT_BMI2) & 1;
+    ret = has_adx && has_bmi2;
+#endif
+    return ret;
+}
+
 static int secp256k1_selftest_passes(void) {
     return secp256k1_selftest_sha256();
 }

From 4742310ff47a61373715e01b9330604114052aff Mon Sep 17 00:00:00 2001
From: Joel
Date: Fri, 7 Jul 2023 09:44:16 +0930
Subject: [PATCH 3/3] replace the asm implementation for mul/square inner

---
 README.md                 |   2 +-
 src/field_5x52_asm_impl.h | 819 +++++++++++++++-----------------
 2 files changed, 330 insertions(+), 491 deletions(-)

diff --git a/README.md b/README.md
index 19dabe8505..86deb0a205 100644
--- a/README.md
+++ b/README.md
@@ -34,7 +34,7 @@ Implementation details
 * Expose only higher level interfaces to minimize the API surface and improve application security. ("Be difficult to use insecurely.")
 * Field operations
   * Optimized implementation of arithmetic modulo the curve's field size (2^256 - 0x1000003D1).
-  * Using 5 52-bit limbs (including hand-optimized assembly for x86_64, by Diederik Huys).
+  * Using 5 52-bit limbs (including [CryptOpt](https://github.com/0xADE1A1DE/CryptOpt)-optimized assembly for x86_64, which comes with formal correctness proofs).
   * Using 10 26-bit limbs (including hand-optimized assembly for 32-bit ARM, by Wladimir J.
van der Laan). * This is an experimental feature that has not received enough scrutiny to satisfy the standard of quality of this library but is made available for testing and review by the community. * Scalar operations diff --git a/src/field_5x52_asm_impl.h b/src/field_5x52_asm_impl.h index 04a9af2105..c91f13b46a 100644 --- a/src/field_5x52_asm_impl.h +++ b/src/field_5x52_asm_impl.h @@ -1,14 +1,5 @@ -/*********************************************************************** - * Copyright (c) 2013-2014 Diederik Huys, Pieter Wuille * - * Distributed under the MIT software license, see the accompanying * - * file COPYING or https://www.opensource.org/licenses/mit-license.php.* - ***********************************************************************/ - /** - * Changelog: - * - March 2013, Diederik Huys: original version - * - November 2014, Pieter Wuille: updated to use Peter Dettman's parallel multiplication algorithm - * - December 2014, Pieter Wuille: converted from YASM to GCC inline assembly + * Generated by CryptOpt (https://github.com/0xADE1A1DE/CryptOpt) */ #ifndef SECP256K1_FIELD_INNER5X52_IMPL_H @@ -16,489 +7,337 @@ #include "util.h" -SECP256K1_INLINE static void secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t * SECP256K1_RESTRICT b) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * r15:rcx = d - * r10-r14 = a0-a4 - * rbx = b - * rdi = r - * rsi = a / t? - */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" +SECP256K1_INLINE static void +secp256k1_fe_mul_inner(uint64_t *r, const uint64_t *a, const uint64_t *SECP256K1_RESTRICT b) { + uint64_t tmp0, tmp1, tmp2, tmp3; + __asm__ __volatile__( + "mov %%rdx,%%rax\n" + "mov 0x10(%%rdx),%%rdx\n" + "mulx 0x8(%%rsi),%%r10,%%r11\n" + "mov 0x20(%%rax),%%rdx\n" + "mulx 0x20(%%rsi),%%rcx,%%r8\n" + "mov (%%rax),%%rdx\n" + "mulx 0x18(%%rsi),%%r9,%%rbx\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x8(%%rax),%%r12,%%r13\n" + "xor %%rdx,%%rdx\n" + "adox %%r12,%%r9\n" + "adox %%rbx,%%r13\n" + "mov 0x18(%%rax),%%rdx\n" + "mulx (%%rsi),%%rbx,%%r12\n" + "adcx %%r10,%%r9\n" + "adcx %%r13,%%r11\n" + "add %%rbx,%%r9\n" + "adcx %%r11,%%r12\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx 0x8(%%rax),%%r10,%%r13\n" + "mov (%%rax),%%rdx\n" + "mulx 0x20(%%rsi),%%rbx,%%r11\n" + "movabs $0x1000003d10,%%rdx\n" + "mulx %%rcx,%%r14,%%r15\n" + "add %%r9,%%r14\n" + "adcx %%r15,%%r12\n" + "mov %%r14,%%r9\n" + "shrd $0x34,%%r12,%%r9\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x10(%%rax),%%rcx,%%r15\n" + "xor %%rdx,%%rdx\n" + "adox %%r10,%%rbx\n" + "adox %%r11,%%r13\n" + "adcx %%rcx,%%rbx\n" + "adcx %%r13,%%r15\n" + "mov 0x18(%%rax),%%rdx\n" + "mulx 0x8(%%rsi),%%r10,%%r11\n" + "add %%r10,%%rbx\n" + "adcx %%r15,%%r11\n" + "mov (%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%r12,%%rcx\n" + "xor %%rdx,%%rdx\n" + "adox %%r12,%%rbx\n" + "adox %%r11,%%rcx\n" + "movabs $0x1000003d10000,%%r13\n" + "mov %%r13,%%rdx\n" + "mulx %%r8,%%r13,%%r15\n" + "adcx %%r9,%%rbx\n" + "adc $0x0,%%rcx\n" + "add %%rbx,%%r13\n" + "adcx %%r15,%%rcx\n" + "movabs $0xfffffffffffff,%%r8\n" + "mov %%r13,%%r9\n" + "and %%r8,%%r9\n" + "mov %%r9,%%r10\n" + "shr $0x30,%%r10\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx 0x10(%%rax),%%r11,%%r12\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx 0x8(%%rax),%%r15,%%rbx\n" + "xor %%rdx,%%rdx\n" + "adox %%r11,%%r15\n" + "adox %%rbx,%%r12\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x18(%%rax),%%r11,%%rbx\n" + "adcx %%r11,%%r15\n" + 
"adcx %%r12,%%rbx\n" + "mov 0x8(%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%r12,%%r11\n" + "mov 0x18(%%rax),%%rdx\n" + "mov %%rdi,%q0\n" + "mulx 0x18(%%rsi),%%r8,%%rdi\n" + "shrd $0x34,%%rcx,%%r13\n" + "add %%r12,%%r15\n" + "adcx %%rbx,%%r11\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx 0x10(%%rax),%%rcx,%%rbx\n" + "xor %%rdx,%%rdx\n" + "adox %%r13,%%r15\n" + "adox %%rdx,%%r11\n" + "mov %%r15,%%r12\n" + "shrd $0x34,%%r11,%%r12\n" + "xor %%r13,%%r13\n" + "adox %%r8,%%rcx\n" + "adox %%rbx,%%rdi\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx 0x20(%%rax),%%r8,%%rbx\n" + "adcx %%r8,%%rcx\n" + "adcx %%rdi,%%rbx\n" + "xor %%rdx,%%rdx\n" + "adox %%r12,%%rcx\n" + "adox %%rdx,%%rbx\n" + "movabs $0xfffffffffffff,%%r13\n" + "and %%r13,%%r15\n" + "shl $0x4,%%r15\n" + "mov %%rcx,%%r11\n" + "shrd $0x34,%%rbx,%%r11\n" + "lea (%%r10,%%r15,1),%%r10\n" + "movabs $0x1000003d1,%%r12\n" + "mov %%r10,%%rdx\n" + "mulx %%r12,%%r10,%%rdi\n" + "movabs $0xffffffffffff,%%r8\n" + "and %%r8,%%r9\n" + "mov (%%rax),%%rdx\n" + "mulx 0x8(%%rsi),%%rbx,%%r15\n" + "mov (%%rax),%%rdx\n" + "mulx (%%rsi),%%r8,%%r12\n" + "adox %%r8,%%r10\n" + "adox %%rdi,%%r12\n" + "mov 0x8(%%rax),%%rdx\n" + "mulx (%%rsi),%%rdi,%%r8\n" + "mov 0x8(%%rsi),%%rdx\n" + "mov %%r9,%q1\n" + "mulx 0x8(%%rax),%%r13,%%r9\n" + "mov 0x18(%%rax),%%rdx\n" + "mov %%r9,%q2\n" + "mov %%r13,%q3\n" + "mulx 0x20(%%rsi),%%r9,%%r13\n" + "mov %%r10,%%rdx\n" + "shrd $0x34,%%r12,%%rdx\n" + "add %%rdi,%%rbx\n" + "adcx %%r15,%%r8\n" + "add %%rdx,%%rbx\n" + "adc $0x0,%%r8\n" + "mov 0x20(%%rax),%%rdx\n" + "mulx 0x18(%%rsi),%%r15,%%r12\n" + "xor %%rdx,%%rdx\n" + "adox %%r15,%%r9\n" + "adox %%r13,%%r12\n" + "adcx %%r11,%%r9\n" + "adc $0x0,%%r12\n" + "mov 0x10(%%rax),%%rdx\n" + "mulx (%%rsi),%%r11,%%rdi\n" + "movabs $0xfffffffffffff,%%rdx\n" + "and %%rdx,%%rcx\n" + "movabs $0x1000003d10,%%r13\n" + "mov %%r13,%%rdx\n" + "mulx %%rcx,%%r13,%%r15\n" + "adox %%rbx,%%r13\n" + "adox %%r15,%%r8\n" + "mulx %%r9,%%rbx,%%rcx\n" + "mov (%%rax),%%rdx\n" + "mulx 0x10(%%rsi),%%r9,%%r15\n" + "adcx %q3,%%r9\n" + "adcx %q2,%%r15\n" + "mov %%r13,%%rdx\n" + "shrd $0x34,%%r8,%%rdx\n" + "xor %%r8,%%r8\n" + "adox %%r11,%%r9\n" + "adox %%r15,%%rdi\n" + "adcx %%rdx,%%r9\n" + "adc $0x0,%%rdi\n" + "movabs $0xfffffffffffff,%%r11\n" + "and %%r11,%%r10\n" + "adox %%r9,%%rbx\n" + "adox %%rcx,%%rdi\n" + "mov %%rbx,%%rcx\n" + "and %%r11,%%rcx\n" + "shrd $0x34,%%rdi,%%rbx\n" + "mov %q0,%%r15\n" + "mov %%r10,(%%r15)\n" + "and %%r11,%%r14\n" + "movabs $0x1000003d10000,%%rdx\n" + "mulx %%r12,%%r9,%%r10\n" + "lea (%%rbx,%%r14,1),%%rbx\n" + "adox %%rbx,%%r9\n" + "adox %%r8,%%r10\n" + "mov %%r9,%%r12\n" + "and %%r11,%%r12\n" + "mov %%r12,0x18(%%r15)\n" + "shrd $0x34,%%r10,%%r9\n" + "add %q1,%%r9\n" + "mov %%r9,0x20(%%r15)\n" + "and %%r11,%%r13\n" + "mov %%r13,0x8(%%r15)\n" + "mov %%rcx,0x10(%%r15)\n" - /* d += a3 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rcx\n" - "movq %%rdx,%%r15\n" - /* d += a2 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d = a0 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c = a4 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c >>= 52 (%%r8 only) */ 
- "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* d += a4 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a0 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* t4 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a4 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a1 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* u0 = d & M (%%rsi) */ - "movq %%rcx,%%rsi\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a1 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b1 */ - "movq 8(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a4 * b2 */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a2 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%r15,%%rcx\n" - "xorq %%r15,%%r15\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a2 * b0 */ - "movq 0(%%rbx),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a1 * b1 */ - 
"movq 8(%%rbx),%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* c += a0 * b2 (last use of %%r10 = a0) */ - "movq 16(%%rbx),%%rax\n" - "mulq %%r10\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0), t4 (%%rsi) */ - "movq %q2,%%rsi\n" - "movq %q1,%%r10\n" - /* d += a4 * b3 */ - "movq 24(%%rbx),%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* d += a3 * b4 */ - "movq 32(%%rbx),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rcx\n" - "adcq %%rdx,%%r15\n" - /* c += (d & M) * R */ - "movq %%rcx,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rcx only) */ - "shrdq $52,%%r15,%%rcx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rcx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "movq $0xfffffffffffff,%%rdx\n" - "andq %%rdx,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "b"(b), "D"(r) -: "%rax", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); + : "=&m"(tmp0), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3), "+D"(r), "+d"(b) + : "S"(a) + : "rax", "rbx", "rcx", "r8", "r9", "r10", "r11", "r12", "r13", + "r14", "r15", "cc", "memory"); } SECP256K1_INLINE static void secp256k1_fe_sqr_inner(uint64_t *r, const uint64_t *a) { -/** - * Registers: rdx:rax = multiplication accumulator - * r9:r8 = c - * rcx:rbx = d - * r10-r14 = a0-a4 - * r15 = M (0xfffffffffffff) - * rdi = r - * rsi = a / t? 
- */ - uint64_t tmp1, tmp2, tmp3; -__asm__ __volatile__( - "movq 0(%%rsi),%%r10\n" - "movq 8(%%rsi),%%r11\n" - "movq 16(%%rsi),%%r12\n" - "movq 24(%%rsi),%%r13\n" - "movq 32(%%rsi),%%r14\n" - "movq $0xfffffffffffff,%%r15\n" - /* d = (a0*2) * a3 */ - "leaq (%%r10,%%r10,1),%%rax\n" - "mulq %%r13\n" - "movq %%rax,%%rbx\n" - "movq %%rdx,%%rcx\n" - /* d += (a1*2) * a2 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c = a4 * a4 */ - "movq %%r14,%%rax\n" - "mulq %%r14\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += (c & M) * R */ - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* t3 (tmp1) = d & M */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - "movq %%rsi,%q1\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* a4 *= 2 */ - "addq %%r14,%%r14\n" - /* d += a0 * a4 */ - "movq %%r10,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d+= (a1*2) * a3 */ - "leaq (%%r11,%%r11,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a2 * a2 */ - "movq %%r12,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += c * R */ - "movq %%r8,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* t4 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* tx = t4 >> 48 (tmp3) */ - "movq %%rsi,%%rax\n" - "shrq $48,%%rax\n" - "movq %%rax,%q3\n" - /* t4 &= (M >> 4) (tmp2) */ - "movq $0xffffffffffff,%%rax\n" - "andq %%rax,%%rsi\n" - "movq %%rsi,%q2\n" - /* c = a0 * a0 */ - "movq %%r10,%%rax\n" - "mulq %%r10\n" - "movq %%rax,%%r8\n" - "movq %%rdx,%%r9\n" - /* d += a1 * a4 */ - "movq %%r11,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += (a2*2) * a3 */ - "leaq (%%r12,%%r12,1),%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* u0 = d & M (%%rsi) */ - "movq %%rbx,%%rsi\n" - "andq %%r15,%%rsi\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* u0 = (u0 << 4) | tx (%%rsi) */ - "shlq $4,%%rsi\n" - "movq %q3,%%rax\n" - "orq %%rax,%%rsi\n" - /* c += u0 * (R >> 4) */ - "movq $0x1000003d1,%%rax\n" - "mulq %%rsi\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[0] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,0(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* a0 *= 2 */ - "addq %%r10,%%r10\n" - /* c += a0 * a1 */ - "movq %%r10,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a2 * a4 */ - "movq %%r12,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* d += a3 * a3 */ - "movq %%r13,%%rax\n" - "mulq %%r13\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 */ - "shrdq $52,%%rcx,%%rbx\n" - "xorq %%rcx,%%rcx\n" - /* r[1] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,8(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += a0 * a2 (last use of %%r10) */ - "movq %%r10,%%rax\n" - "mulq %%r12\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* fetch t3 (%%r10, overwrites a0),t4 (%%rsi) */ - "movq 
%q2,%%rsi\n" - "movq %q1,%%r10\n" - /* c += a1 * a1 */ - "movq %%r11,%%rax\n" - "mulq %%r11\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d += a3 * a4 */ - "movq %%r13,%%rax\n" - "mulq %%r14\n" - "addq %%rax,%%rbx\n" - "adcq %%rdx,%%rcx\n" - /* c += (d & M) * R */ - "movq %%rbx,%%rax\n" - "andq %%r15,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* d >>= 52 (%%rbx only) */ - "shrdq $52,%%rcx,%%rbx\n" - /* r[2] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,16(%%rdi)\n" - /* c >>= 52 */ - "shrdq $52,%%r9,%%r8\n" - "xorq %%r9,%%r9\n" - /* c += t3 */ - "addq %%r10,%%r8\n" - /* c += d * R */ - "movq %%rbx,%%rax\n" - "movq $0x1000003d10,%%rdx\n" - "mulq %%rdx\n" - "addq %%rax,%%r8\n" - "adcq %%rdx,%%r9\n" - /* r[3] = c & M */ - "movq %%r8,%%rax\n" - "andq %%r15,%%rax\n" - "movq %%rax,24(%%rdi)\n" - /* c >>= 52 (%%r8 only) */ - "shrdq $52,%%r9,%%r8\n" - /* c += t4 (%%r8 only) */ - "addq %%rsi,%%r8\n" - /* r[4] = c */ - "movq %%r8,32(%%rdi)\n" -: "+S"(a), "=&m"(tmp1), "=&m"(tmp2), "=&m"(tmp3) -: "D"(r) -: "%rax", "%rbx", "%rcx", "%rdx", "%r8", "%r9", "%r10", "%r11", "%r12", "%r13", "%r14", "%r15", "cc", "memory" -); -} + uint64_t tmp0, tmp1; + __asm__ __volatile__( -#endif /* SECP256K1_FIELD_INNER5X52_IMPL_H */ + "mov (%%rsi),%%rax\n" + "lea (%%rax,%%rax,1),%%r10\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx %%r10,%%rax,%%r11\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx %%rdx,%%rcx,%%r8\n" + "mov 0x8(%%rsi),%%rdx\n" + "lea (%%rdx,%%rdx,1),%%r9\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx %%r9,%%rbx,%%r12\n" + "mov 0x10(%%rsi),%%rdx\n" + "mov %%rdx,%%r13\n" + "shl %%r13\n" + "xor %%rdx,%%rdx\n" + "adox %%rax,%%rbx\n" + "adox %%r12,%%r11\n" + "movabs $0x1000003d10,%%rax\n" + "mov %%rcx,%%rdx\n" + "mulx %%rax,%%rcx,%%r12\n" + "adcx %%rbx,%%rcx\n" + "adcx %%r12,%%r11\n" + "mov 0x10(%%rsi),%%rdx\n" + "mulx %%rdx,%%rbx,%%r12\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx %%r9,%%r14,%%r15\n" + "mov %%rcx,%%rdx\n" + "shrd $0x34,%%r11,%%rdx\n" + "xor %%r11,%%r11\n" + "adox %%r14,%%rbx\n" + "adox %%r12,%%r15\n" + "mov %%rdx,%%r12\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx %%r10,%%r14,%%r11\n" + "adcx %%r14,%%rbx\n" + "adcx %%r15,%%r11\n" + "movabs $0x1000003d10000,%%rdx\n" + "mulx %%r8,%%r15,%%r14\n" + "xor %%r8,%%r8\n" + "adox %%r12,%%rbx\n" + "adox %%r8,%%r11\n" + "adcx %%rbx,%%r15\n" + "adcx %%r14,%%r11\n" + "mov %%r15,%%r12\n" + "shrd $0x34,%%r11,%%r12\n" + "movabs $0xfffffffffffff,%%r14\n" + "and %%r14,%%rcx\n" + "mov %%r9,%%rdx\n" + "mulx 0x20(%%rsi),%%r9,%%rbx\n" + "mov %%r13,%%rdx\n" + "mulx 0x18(%%rsi),%%r13,%%r11\n" + "adox %%r9,%%r13\n" + "adox %%r11,%%rbx\n" + "adcx %%r12,%%r13\n" + "adc $0x0,%%rbx\n" + "mov %%r13,%%r12\n" + "and %%r14,%%r12\n" + "mulx 0x20(%%rsi),%%r9,%%r11\n" + "shrd $0x34,%%rbx,%%r13\n" + "mov 0x18(%%rsi),%%rdx\n" + "mulx %%rdx,%%rbx,%%r8\n" + "xor %%rdx,%%rdx\n" + "adox %%r9,%%rbx\n" + "adox %%r8,%%r11\n" + "adcx %%r13,%%rbx\n" + "adc $0x0,%%r11\n" + "mov %%rbx,%%r9\n" + "shrd $0x34,%%r11,%%r9\n" + "imul $0x2,0x18(%%rsi),%%r13\n" + "and %%r14,%%r15\n" + "and %%r14,%%rbx\n" + "mov 0x20(%%rsi),%%rdx\n" + "mulx %%r13,%%r8,%%r11\n" + "mov %%r15,%%rdx\n" + "shr $0x30,%%rdx\n" + "shl $0x4,%%r12\n" + "lea (%%rdx,%%r12,1),%%rdx\n" + "movabs $0x1000003d1,%%r13\n" + "mulx %%r13,%%r12,%%r14\n" + "xor %%rdx,%%rdx\n" + "adox %%r9,%%r8\n" + "adox %%rdx,%%r11\n" + "mov (%%rsi),%%rdx\n" + "mulx %%rdx,%%r9,%%r13\n" + "adcx %%r9,%%r12\n" + "adcx %%r14,%%r13\n" + "mov %%rax,%%rdx\n" + "mulx %%rbx,%%rax,%%r14\n" + "mov %%r12,%%rbx\n" + 
"shrd $0x34,%%r13,%%rbx\n" + "mulx %%r8,%%r9,%%r13\n" + "mov %%r10,%%rdx\n" + "mulx 0x10(%%rsi),%%r10,%%r8\n" + "mov %%rdi,%q0\n" + "mov %%r11,%q1\n" + "mulx 0x8(%%rsi),%%rdi,%%r11\n" + "xor %%rdx,%%rdx\n" + "adox %%rbx,%%rdi\n" + "adox %%rdx,%%r11\n" + "adcx %%rdi,%%rax\n" + "adcx %%r14,%%r11\n" + "movabs $0xfffffffffffff,%%r14\n" + "and %%r14,%%r12\n" + "mov %%rax,%%rbx\n" + "shrd $0x34,%%r11,%%rbx\n" + "mov %q0,%%rdi\n" + "mov %%r12,(%%rdi)\n" + "mov 0x8(%%rsi),%%rdx\n" + "mulx %%rdx,%%r11,%%r12\n" + "xor %%rdx,%%rdx\n" + "adox %%r10,%%r11\n" + "adox %%r12,%%r8\n" + "adcx %%rbx,%%r11\n" + "adc $0x0,%%r8\n" + "add %%r11,%%r9\n" + "adcx %%r13,%%r8\n" + "mov %%r9,%%r13\n" + "shrd $0x34,%%r8,%%r13\n" + "lea 0x0(%%r13,%%rcx,1),%%r13\n" + "movabs $0x1000003d10000,%%rcx\n" + "mov %%rcx,%%rdx\n" + "mulx %q1,%%rcx,%%r10\n" + "add %%r13,%%rcx\n" + "adc $0x0,%%r10\n" + "mov %%rcx,%%rbx\n" + "shrd $0x34,%%r10,%%rbx\n" + "and %%r14,%%rax\n" + "mov %%rax,0x8(%%rdi)\n" + "movabs $0xffffffffffff,%%r12\n" + "and %%r12,%%r15\n" + "lea (%%rbx,%%r15,1),%%rbx\n" + "and %%r14,%%rcx\n" + "and %%r14,%%r9\n" + "mov %%r9,0x10(%%rdi)\n" + "mov %%rcx,0x18(%%rdi)\n" + "mov %%rbx,0x20(%%rdi)\n" + + : "=&m"(tmp0), "=&m"(tmp1), "+D"(r) + : "D"(r), "S"(a) + : "rax", "rbx", "rcx", "rdx", "r8", "r9", "r10", "r11", "r12", + "r13", "r14", "r15", "cc", "memory"); +} +#endif