From 7d9a99759d60574d87476b3acb26d5b953e63b84 Mon Sep 17 00:00:00 2001 From: dan pittman Date: Mon, 29 Jan 2024 16:00:26 -0800 Subject: [PATCH] address review comments --- crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl | 43 +---- crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl | 48 +---- crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl | 42 +---- crypto/fipsmodule/bn/exponentiation.c | 118 ++++++------- crypto/fipsmodule/bn/internal.h | 196 +++++++++++++++++++++ crypto/fipsmodule/bn/rsaz_exp_x2.c | 77 +------- 6 files changed, 271 insertions(+), 253 deletions(-) diff --git a/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl index 347de51cf29..950b05e86ac 100644 --- a/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl +++ b/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl @@ -75,7 +75,10 @@ *STDOUT=*OUT; if ($avx512ifma>0) {{{ -@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); + +@_6_args_universal_ABI = $win64 ? +("%rcx","%rdx","%r8","%r9","%r10","%r11") : +("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); $code.=<<___; .text @@ -95,26 +98,6 @@ ___ ############################################################################### -# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52. -# -# AMM is defined as presented in the paper [1]. -# -# The input and output are presented in 2^52 radix domain, i.e. -# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed. -# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 -# -# NB: the AMM implementation does not perform "conditional" subtraction step -# specified in the original algorithm as according to the Lemma 1 from the paper -# [2], the result will be always < 2*m and can be used as a direct input to -# the next AMM iteration. This post-condition is true, provided the correct -# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, -# which matches our case: 1040 > 1024 + 2 * 1. -# -# [1] Gueron, S. 
Efficient software implementations of modular exponentiation. -# DOI: 10.1007/s13389-012-0031-5 -# [2] Gueron, S. Enhanced Montgomery Multiplication. -# DOI: 10.1007/3-540-36400-5_5 -# # void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, # const BN_ULONG *a, # const BN_ULONG *b, @@ -122,7 +105,7 @@ # BN_ULONG k0); ############################################################################### { -# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") +# input parameters my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; my $mask52 = "%rax"; @@ -414,13 +397,6 @@ sub amm52x20_x1_norm { ___ ############################################################################### -# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52 -# -# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost -# Montgomery Multiplication algorithm and function input parameters description. -# -# This function does two AMMs for two independent inputs, hence dual. -# # void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20], # const BN_ULONG a[2][20], # const BN_ULONG b[2][20], @@ -522,19 +498,10 @@ sub amm52x20_x1_norm { } ############################################################################### -# Constant time extraction from the precomputed table of powers base^i, where -# i = 0..2^EXP_WIN_SIZE-1 -# -# The input |red_table| contains precomputations for two independent base values. -# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. -# -# Extracted value (output) is 2 20 digit numbers in 2^52 radix. 
-# # void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, # const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20], # int red_table_idx1, int red_table_idx2); # -# EXP_WIN_SIZE = 5 ############################################################################### { # input parameters diff --git a/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl index 600f5a3f269..b15b35e75ec 100644 --- a/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl +++ b/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl @@ -74,32 +74,11 @@ *STDOUT=*OUT; if ($avx512ifma>0) {{{ -@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); +@_6_args_universal_ABI = $win64 ? +("%rcx","%rdx","%r8","%r9","%r10","%r11") : +("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ############################################################################### -# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52. -# -# AMM is defined as presented in the paper [1]. -# -# The input and output are presented in 2^52 radix domain, i.e. -# |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed -# -# NOTE: the function uses zero-padded data - 2 high QWs is a padding. -# -# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 -# -# NB: the AMM implementation does not perform "conditional" subtraction step -# specified in the original algorithm as according to the Lemma 1 from the paper -# [2], the result will be always < 2*m and can be used as a direct input to -# the next AMM iteration. This post-condition is true, provided the correct -# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, -# which matches our case: 1560 > 1536 + 2 * 1. -# -# [1] Gueron, S. Efficient software implementations of modular exponentiation. -# DOI: 10.1007/s13389-012-0031-5 -# [2] Gueron, S. Enhanced Montgomery Multiplication. 
-# DOI: 10.1007/3-540-36400-5_5 -# # void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, # const BN_ULONG *a, # const BN_ULONG *b, @@ -107,7 +86,7 @@ # BN_ULONG k0); ############################################################################### { -# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") +# input parameters my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; my $mask52 = "%rax"; @@ -504,15 +483,6 @@ sub amm52x30_x1_norm { ___ ############################################################################### -# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52 -# -# See description of ossl_rsaz_amm52x30_x1_ifma256() above for details about Almost -# Montgomery Multiplication algorithm and function input parameters description. -# -# This function does two AMMs for two independent inputs, hence dual. -# -# NOTE: the function uses zero-padded data - 2 high QWs is a padding. -# # void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32], # const BN_ULONG a[2][32], # const BN_ULONG b[2][32], @@ -659,20 +629,10 @@ sub amm52x30_x1_norm { } ############################################################################### -# Constant time extraction from the precomputed table of powers base^i, where -# i = 0..2^EXP_WIN_SIZE-1 -# -# The input |red_table| contains precomputations for two independent base values. -# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. -# -# Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 radix. 
-# (2 high QW is zero padding) -# # void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y, # const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32], # int red_table_idx1, int red_table_idx2); # -# EXP_WIN_SIZE = 5 ############################################################################### { # input parameters diff --git a/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl index 9edd148658d..fbdc4d7c050 100644 --- a/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl +++ b/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl @@ -74,29 +74,11 @@ *STDOUT=*OUT; if ($avx512ifma>0) {{{ -@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); +@_6_args_universal_ABI = $win64 ? +("%rcx","%rdx","%r8","%r9","%r10","%r11") : +("%rdi","%rsi","%rdx","%rcx","%r8","%r9"); ############################################################################### -# Almost Montgomery Multiplication (AMM) for 40-digit number in radix 2^52. -# -# AMM is defined as presented in the paper [1]. -# -# The input and output are presented in 2^52 radix domain, i.e. -# |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high bits zeroed. -# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 -# -# NB: the AMM implementation does not perform "conditional" subtraction step -# specified in the original algorithm as according to the Lemma 1 from the paper -# [2], the result will be always < 2*m and can be used as a direct input to -# the next AMM iteration. This post-condition is true, provided the correct -# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, -# which matches our case: 2080 > 2048 + 2 * 1. -# -# [1] Gueron, S. Efficient software implementations of modular exponentiation. -# DOI: 10.1007/s13389-012-0031-5 -# [2] Gueron, S. Enhanced Montgomery Multiplication. 
-# DOI: 10.1007/3-540-36400-5_5 -# # void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, # const BN_ULONG *a, # const BN_ULONG *b, @@ -104,7 +86,7 @@ # BN_ULONG k0); ############################################################################### { -# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8") +# input parameters my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI; my $mask52 = "%rax"; @@ -545,13 +527,6 @@ sub amm52x40_x1_norm { ___ ############################################################################### -# Dual Almost Montgomery Multiplication for 40-digit number in radix 2^52 -# -# See description of ossl_rsaz_amm52x40_x1_ifma256() above for details about Almost -# Montgomery Multiplication algorithm and function input parameters description. -# -# This function does two AMMs for two independent inputs, hence dual. -# # void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40], # const BN_ULONG a[2][40], # const BN_ULONG b[2][40], @@ -706,19 +681,10 @@ sub amm52x40_x1_norm { } ############################################################################### -# Constant time extraction from the precomputed table of powers base^i, where -# i = 0..2^EXP_WIN_SIZE-1 -# -# The input |red_table| contains precomputations for two independent base values. -# |red_table_idx1| and |red_table_idx2| are corresponding power indexes. -# -# Extracted value (output) is 2 40 digits numbers in 2^52 radix. 
-# # void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y, # const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40], # int red_table_idx1, int red_table_idx2); # -# EXP_WIN_SIZE = 5 ############################################################################### { # input parameters diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c index 2d133a900e9..31d1fd3e3d3 100644 --- a/crypto/fipsmodule/bn/exponentiation.c +++ b/crypto/fipsmodule/bn/exponentiation.c @@ -1266,78 +1266,78 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1 const BIGNUM *m2, const BN_MONT_CTX *in_mont2, BN_CTX *ctx) { - int ret = 0; + int ret = 0; #ifdef RSAZ_512_ENABLED - BN_MONT_CTX *mont1 = NULL; - BN_MONT_CTX *mont2 = NULL; - - if (ossl_rsaz_avx512ifma_eligible() && - (((a1->width == 16) && (p1->width == 16) && (BN_num_bits(m1) == 1024) && - (a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) || - ((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) && - (a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) || - ((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) && - (a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) { - - int widthn = a1->width; - /* Modulus bits of |m1| and |m2| are equal */ - int mod_bits = BN_num_bits(m1); - - if (!bn_wexpand(rr1, widthn)) - goto err; - if (!bn_wexpand(rr2, widthn)) - goto err; - - /* Ensure that montgomery contexts are initialized */ - if (in_mont1 == NULL) { - if ((mont1 = BN_MONT_CTX_new()) == NULL) - goto err; - if (!BN_MONT_CTX_set(mont1, m1, ctx)) - goto err; - in_mont1 = mont1; - } - if (in_mont2 == NULL) { - if ((mont2 = BN_MONT_CTX_new()) == NULL) - goto err; - if (!BN_MONT_CTX_set(mont2, m2, ctx)) - goto err; - in_mont2 = mont2; - } + BN_MONT_CTX *mont1 = NULL; + BN_MONT_CTX *mont2 = NULL; + + if (ossl_rsaz_avx512ifma_eligible() && + (((a1->width == 16) && (p1->width == 16) && 
(BN_num_bits(m1) == 1024) && + (a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) || + ((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) && + (a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) || + ((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) && + (a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) { + + int widthn = a1->width; + /* Modulus bits of |m1| and |m2| are equal */ + int mod_bits = BN_num_bits(m1); + + if (!bn_wexpand(rr1, widthn)) + goto err; + if (!bn_wexpand(rr2, widthn)) + goto err; - ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d, - in_mont1->RR.d, in_mont1->n0[0], - rr2->d, a2->d, p2->d, m2->d, - in_mont2->RR.d, in_mont2->n0[0], - mod_bits); + /* Ensure that montgomery contexts are initialized */ + if (in_mont1 == NULL) { + if ((mont1 = BN_MONT_CTX_new()) == NULL) + goto err; + if (!BN_MONT_CTX_set(mont1, m1, ctx)) + goto err; + in_mont1 = mont1; + } + if (in_mont2 == NULL) { + if ((mont2 = BN_MONT_CTX_new()) == NULL) + goto err; + if (!BN_MONT_CTX_set(mont2, m2, ctx)) + goto err; + in_mont2 = mont2; + } - rr1->width = widthn; - rr1->neg = 0; - bn_set_minimal_width(rr1); + ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d, + in_mont1->RR.d, in_mont1->n0[0], + rr2->d, a2->d, p2->d, m2->d, + in_mont2->RR.d, in_mont2->n0[0], + mod_bits); - rr2->width = widthn; - rr2->neg = 0; - bn_set_minimal_width(rr2); + rr1->width = widthn; + rr1->neg = 0; + bn_set_minimal_width(rr1); - goto err; - - } + rr2->width = widthn; + rr2->neg = 0; + bn_set_minimal_width(rr2); + + goto err; + + } #endif - /* rr1 = a1^p1 mod m1 */ - ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1); - /* rr2 = a2^p2 mod m2 */ - ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2); + /* rr1 = a1^p1 mod m1 */ + ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1); + /* rr2 = a2^p2 mod m2 */ + ret &= BN_mod_exp_mont_consttime(rr2, a2, 
p2, m2, ctx, in_mont2); #ifdef RSAZ_512_ENABLED err: - if (mont2) - BN_MONT_CTX_free(mont2); - if (mont1) - BN_MONT_CTX_free(mont1); + if (mont2) + BN_MONT_CTX_free(mont2); + if (mont1) + BN_MONT_CTX_free(mont1); #endif - return ret; + return ret; } int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p, diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h index bf82d32e9ae..1cb0f848242 100644 --- a/crypto/fipsmodule/bn/internal.h +++ b/crypto/fipsmodule/bn/internal.h @@ -790,6 +790,202 @@ void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in, // leading zeros. void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len); + +// Naming convention for the following functions: +// +// * amm: Almost Montgomery Multiplication +// * ams: Almost Montgomery Squaring +// * 52xZZ: data represented as array of ZZ digits in 52-bit radix +// * _x1_/_x2_: 1 or 2 independent inputs/outputs +// * ifma256: uses 256-bit wide IFMA ISA (AVX512_IFMA256) +// +// +// Almost Montgomery Multiplication (AMM) for 20-digit number in radix +// 2^52. +// +// AMM is defined as presented in the paper [1]. +// +// The input and output are presented in 2^52 radix domain, i.e. +// |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high +// bits zeroed. |k0| is a Montgomery coefficient, which is here k0 = +// -1/m mod 2^64 +// +// NB: the AMM implementation does not perform "conditional" +// subtraction step specified in the original algorithm as according +// to the Lemma 1 from the paper [2], the result will be always < 2*m +// and can be used as a direct input to the next AMM iteration. This +// post-condition is true, provided the correct parameter |s| (notion +// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which +// matches our case: 1040 > 1024 + 2 * 1. +// +// [1] Gueron, S. Efficient software implementations of modular +// exponentiation. 
DOI: 10.1007/s13389-012-0031-5 +// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI: +// 10.1007/3-540-36400-5_5 +void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + BN_ULONG k0); + +// Dual Almost Montgomery Multiplication for 20-digit number in radix +// 2^52 +// +// See description of ossl_rsaz_amm52x20_x1_ifma256() above for +// details about Almost Montgomery Multiplication algorithm and +// function input parameters description. +// +// This function does two AMMs for two independent inputs, hence dual. +void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + const BN_ULONG k0[2]); + +// Constant time extraction from the precomputed table of powers +// base^i, where i = 0..2^EXP_WIN_SIZE-1 +// +// The input |red_table| contains precomputations for two independent +// base values. |red_table_idx1| and |red_table_idx2| are +// corresponding power indexes. +// +// Extracted value (output) is 2 20 digit numbers in 2^52 radix. +// +// EXP_WIN_SIZE = 5 +void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, + const BN_ULONG *red_table, + int red_table_idx1, int red_table_idx2); + +// Almost Montgomery Multiplication (AMM) for 30-digit number in radix +// 2^52. +// +// AMM is defined as presented in the paper [1]. +// +// The input and output are presented in 2^52 radix domain, i.e. +// |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high +// bits zeroed +// +// NOTE: the function uses zero-padded data - 2 high QWs is a padding. +// +// |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64 +// +// NB: the AMM implementation does not perform "conditional" +// subtraction step specified in the original algorithm as according +// to the Lemma 1 from the paper [2], the result will be always < 2*m +// and can be used as a direct input to the next AMM iteration. 
This +// post-condition is true, provided the correct parameter |s| (notion +// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which +// matches our case: 1560 > 1536 + 2 * 1. +// +// [1] Gueron, S. Efficient software implementations of modular +// exponentiation. DOI: 10.1007/s13389-012-0031-5 +// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI: +// 10.1007/3-540-36400-5_5 +void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + BN_ULONG k0); +// Dual Almost Montgomery Multiplication for 30-digit number in radix +// 2^52 +// +// See description of ossl_rsaz_amm52x30_x1_ifma256() above for +// details about Almost Montgomery Multiplication algorithm and +// function input parameters description. +// +// This function does two AMMs for two independent inputs, hence dual. +// +// NOTE: the function uses zero-padded data - 2 high QWs is a padding. +void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + const BN_ULONG k0[2]); + +// Constant time extraction from the precomputed table of powers +// base^i, where i = 0..2^EXP_WIN_SIZE-1 +// +// The input |red_table| contains precomputations for two independent +// base values. |red_table_idx1| and |red_table_idx2| are +// corresponding power indexes. +// +// Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 +// radix. (2 high QW is zero padding) +// +// EXP_WIN_SIZE = 5 +void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y, + const BN_ULONG *red_table, + int red_table_idx1, int red_table_idx2); + +// Almost Montgomery Multiplication (AMM) for 40-digit number in radix +// 2^52. +// +// AMM is defined as presented in the paper [1]. +// +// The input and output are presented in 2^52 radix domain, i.e. +// |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high +// bits zeroed. 
|k0| is a Montgomery coefficient, which is here k0 = +// -1/m mod 2^64 +// +// NB: the AMM implementation does not perform "conditional" +// subtraction step specified in the original algorithm as according +// to the Lemma 1 from the paper [2], the result will be always < 2*m +// and can be used as a direct input to the next AMM iteration. This +// post-condition is true, provided the correct parameter |s| (notion +// of the Lemma 1 from [2]) is chosen, i.e. s >= n + 2 * k, which +// matches our case: 2080 > 2048 + 2 * 1. +// +// [1] Gueron, S. Efficient software implementations of modular +// exponentiation. DOI: 10.1007/s13389-012-0031-5 +// [2] Gueron, S. Enhanced Montgomery Multiplication. DOI: +// 10.1007/3-540-36400-5_5 +void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + BN_ULONG k0); + +// Dual Almost Montgomery Multiplication for 40-digit number in radix +// 2^52 +// +// See description of ossl_rsaz_amm52x40_x1_ifma256() above for +// details about Almost Montgomery Multiplication algorithm and +// function input parameters description. +// +// This function does two AMMs for two independent inputs, hence dual. +void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, + const BN_ULONG *b, const BN_ULONG *m, + const BN_ULONG k0[2]); + +// Constant time extraction from the precomputed table of powers base^i, where +// i = 0..2^EXP_WIN_SIZE-1 +// +// The input |red_table| contains precomputations for two independent base values. +// |red_table_idx1| and |red_table_idx2| are corresponding power indexes. +// +// Extracted value (output) is 2 40 digits numbers in 2^52 radix. 
+// +// EXP_WIN_SIZE = 5 +void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y, + const BN_ULONG *red_table, + int red_table_idx1, int red_table_idx2); + +// Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of +// the same bit size using Almost Montgomery Multiplication, optimized with +// AVX512_IFMA256 ISA. +// +// The parameter w (window size) = 5. +// +// [out] res - result of modular exponentiation: 2x{20,30,40} qword +// values in 2^52 radix. +// [in] base - base (2x{20,30,40} qword values in 2^52 radix) +// [in] exp - array of 2 pointers to {16,24,32} qword values in 2^64 radix. +// Exponent is not converted to redundant representation. +// [in] m - moduli (2x{20,30,40} qword values in 2^52 radix) +// [in] rr - Montgomery parameter for 2 moduli: +// RR(1024) = 2^2080 mod m. +// RR(1536) = 2^3120 mod m. +// RR(2048) = 2^4160 mod m. +// (2x{20,30,40} qword values in 2^52 radix) +// [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64 +// +// \return an int status (the function is declared to return int, not void). 
+static int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base, + const BN_ULONG *exp[2], const BN_ULONG *m, + const BN_ULONG *rr, const BN_ULONG k0[2], + int modulus_bitsize); + + #if defined(__cplusplus) } // extern C #endif diff --git a/crypto/fipsmodule/bn/rsaz_exp_x2.c b/crypto/fipsmodule/bn/rsaz_exp_x2.c index be0467c6bd7..c07fe93388a 100644 --- a/crypto/fipsmodule/bn/rsaz_exp_x2.c +++ b/crypto/fipsmodule/bn/rsaz_exp_x2.c @@ -32,11 +32,9 @@ static void *__dummy = &__dummy; #include #include #include +#include "../../internal.h" #include "rsaz_exp.h" -# define ALIGN_OF(ptr, boundary) \ - ((unsigned char *)(ptr) + (boundary - (((size_t)(ptr)) & (boundary - 1)))) - /* Internal radix */ # define DIGIT_SIZE (52) /* 52-bit mask */ @@ -62,53 +60,6 @@ OPENSSL_INLINE int number_of_digits(int bitsize, int digit_size) return (bitsize + digit_size - 1) / digit_size; } -/* - * For details of the methods declared below please refer to - * crypto/bn/asm/rsaz-avx512.pl - * - * Naming conventions: - * amm = Almost Montgomery Multiplication - * ams = Almost Montgomery Squaring - * 52xZZ - data represented as array of ZZ digits in 52-bit radix - * _x1_/_x2_ - 1 or 2 independent inputs/outputs - * _ifma256 - uses 256-bit wide IFMA ISA (AVX512_IFMA256) - */ - -void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - BN_ULONG k0); -void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - const BN_ULONG k0[2]); -void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y, - const BN_ULONG *red_table, - int red_table_idx1, int red_table_idx2); - -void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - BN_ULONG k0); -void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - const BN_ULONG k0[2]); -void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y, - const 
BN_ULONG *red_table, - int red_table_idx1, int red_table_idx2); - -void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - BN_ULONG k0); -void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a, - const BN_ULONG *b, const BN_ULONG *m, - const BN_ULONG k0[2]); -void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y, - const BN_ULONG *red_table, - int red_table_idx1, int red_table_idx2); - -static int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base, - const BN_ULONG *exp[2], const BN_ULONG *m, - const BN_ULONG *rr, const BN_ULONG k0[2], - int modulus_bitsize); - /* * Dual Montgomery modular exponentiation using prime moduli of the * same bit size, optimized with AVX512 ISA. @@ -198,7 +149,7 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes); if (storage == NULL) goto err; - storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); + storage_aligned = (BN_ULONG *)align_pointer(storage, 64); /* Memory layout for red(undant) representations */ base1_red = storage_aligned; @@ -273,28 +224,6 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1, return ret; } -/* - * Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of - * the same bit size using Almost Montgomery Multiplication, optimized with - * AVX512_IFMA256 ISA. - * - * The parameter w (window size) = 5. - * - * [out] res - result of modular exponentiation: 2x{20,30,40} qword - * values in 2^52 radix. - * [in] base - base (2x{20,30,40} qword values in 2^52 radix) - * [in] exp - array of 2 pointers to {16,24,32} qword values in 2^64 radix. - * Exponent is not converted to redundant representation. - * [in] m - moduli (2x{20,30,40} qword values in 2^52 radix) - * [in] rr - Montgomery parameter for 2 moduli: - * RR(1024) = 2^2080 mod m. - * RR(1536) = 2^3120 mod m. - * RR(2048) = 2^4160 mod m. 
- * (2x{20,30,40} qword values in 2^52 radix) - * [in] k0 - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64 - * - * \return (void). - */ int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out, const BN_ULONG *base, const BN_ULONG *exp[2], @@ -381,7 +310,7 @@ int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out, if (storage == NULL) goto err; OPENSSL_cleanse(storage, storage_len_bytes); - storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64); + storage_aligned = (BN_ULONG *)align_pointer(storage, 64); red_Y = storage_aligned; red_X = red_Y + 2 * red_digits;