Skip to content

Commit

Permalink
address review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
pittma committed Jan 30, 2024
1 parent ac0ac9f commit 7d9a997
Show file tree
Hide file tree
Showing 6 changed files with 271 additions and 253 deletions.
43 changes: 5 additions & 38 deletions crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,10 @@
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

@_6_args_universal_ABI = $win64 ?
("%rcx","%rdx","%r8","%r9","%r10","%r11") :
("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

$code.=<<___;
.text
Expand All @@ -95,34 +98,14 @@
___

###############################################################################
# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
# |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction step
# specified in the original algorithm because, according to Lemma 1 of paper
# [2], the result is always < 2*m and can be used as a direct input to
# the next AMM iteration. This post-condition holds provided the parameter
# |s| (in the notation of Lemma 1 from [2]) is chosen correctly, i.e.
# s >= n + 2 * k, which matches our case: 1040 > 1024 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
# DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
# DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52 = "%rax";
Expand Down Expand Up @@ -414,13 +397,6 @@ sub amm52x20_x1_norm {
___

###############################################################################
# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x20_x1_ifma256() above for details about
# the Almost Montgomery Multiplication algorithm and the function's input
# parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
# const BN_ULONG a[2][20],
# const BN_ULONG b[2][20],
Expand Down Expand Up @@ -522,19 +498,10 @@ sub amm52x20_x1_norm {
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
# i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two 20-digit numbers in radix 2^52.
#
# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
# int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
Expand Down
48 changes: 4 additions & 44 deletions crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl
Original file line number Diff line number Diff line change
Expand Up @@ -74,40 +74,19 @@
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
@_6_args_universal_ABI = $win64 ?
("%rcx","%rdx","%r8","%r9","%r10","%r11") :
("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
# |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction step
# specified in the original algorithm because, according to Lemma 1 of paper
# [2], the result is always < 2*m and can be used as a direct input to
# the next AMM iteration. This post-condition holds provided the parameter
# |s| (in the notation of Lemma 1 from [2]) is chosen correctly, i.e.
# s >= n + 2 * k, which matches our case: 1560 > 1536 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
# DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
# DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52 = "%rax";
Expand Down Expand Up @@ -504,15 +483,6 @@ sub amm52x30_x1_norm {
___

###############################################################################
# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x30_x1_ifma256() above for details about
# the Almost Montgomery Multiplication algorithm and the function's input
# parameters.
#
# This function performs two AMMs on two independent inputs, hence "dual".
#
# NOTE: the function uses zero-padded data - the 2 high QWs are padding.
#
# void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
# const BN_ULONG a[2][32],
# const BN_ULONG b[2][32],
Expand Down Expand Up @@ -659,20 +629,10 @@ sub amm52x30_x1_norm {
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
# i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two (30 + 2)-digit numbers in radix 2^52
# (the 2 high QWs are zero padding).
#
# void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
# int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
Expand Down
42 changes: 4 additions & 38 deletions crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl
Original file line number Diff line number Diff line change
Expand Up @@ -74,37 +74,19 @@
*STDOUT=*OUT;

if ($avx512ifma>0) {{{
@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
@_6_args_universal_ABI = $win64 ?
("%rcx","%rdx","%r8","%r9","%r10","%r11") :
("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

###############################################################################
# Almost Montgomery Multiplication (AMM) for 40-digit number in radix 2^52.
#
# AMM is defined as presented in the paper [1].
#
# The input and output are presented in 2^52 radix domain, i.e.
# |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high bits zeroed.
# |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
#
# NB: the AMM implementation does not perform the "conditional" subtraction step
# specified in the original algorithm because, according to Lemma 1 of paper
# [2], the result is always < 2*m and can be used as a direct input to
# the next AMM iteration. This post-condition holds provided the parameter
# |s| (in the notation of Lemma 1 from [2]) is chosen correctly, i.e.
# s >= n + 2 * k, which matches our case: 2080 > 2048 + 2 * 1.
#
# [1] Gueron, S. Efficient software implementations of modular exponentiation.
# DOI: 10.1007/s13389-012-0031-5
# [2] Gueron, S. Enhanced Montgomery Multiplication.
# DOI: 10.1007/3-540-36400-5_5
#
# void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
# const BN_ULONG *a,
# const BN_ULONG *b,
# const BN_ULONG *m,
# BN_ULONG k0);
###############################################################################
{
# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
# input parameters
my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;

my $mask52 = "%rax";
Expand Down Expand Up @@ -545,13 +527,6 @@ sub amm52x40_x1_norm {
___

###############################################################################
# Dual Almost Montgomery Multiplication for 40-digit number in radix 2^52
#
# See the description of ossl_rsaz_amm52x40_x1_ifma256() above for details about
# the Almost Montgomery Multiplication algorithm and the function's input
# parameters.
#
# This function does two AMMs for two independent inputs, hence dual.
#
# void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
# const BN_ULONG a[2][40],
# const BN_ULONG b[2][40],
Expand Down Expand Up @@ -706,19 +681,10 @@ sub amm52x40_x1_norm {
}

###############################################################################
# Constant time extraction from the precomputed table of powers base^i, where
# i = 0..2^EXP_WIN_SIZE-1
#
# The input |red_table| contains precomputations for two independent base values.
# |red_table_idx1| and |red_table_idx2| are the corresponding power indexes.
#
# The extracted value (output) is two 40-digit numbers in radix 2^52.
#
# void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
# const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
# int red_table_idx1, int red_table_idx2);
#
# EXP_WIN_SIZE = 5
###############################################################################
{
# input parameters
Expand Down
118 changes: 59 additions & 59 deletions crypto/fipsmodule/bn/exponentiation.c
Original file line number Diff line number Diff line change
Expand Up @@ -1266,78 +1266,78 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1
const BIGNUM *m2, const BN_MONT_CTX *in_mont2,
BN_CTX *ctx)
{
int ret = 0;
int ret = 0;

#ifdef RSAZ_512_ENABLED
BN_MONT_CTX *mont1 = NULL;
BN_MONT_CTX *mont2 = NULL;

if (ossl_rsaz_avx512ifma_eligible() &&
(((a1->width == 16) && (p1->width == 16) && (BN_num_bits(m1) == 1024) &&
(a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) ||
((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) &&
(a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) ||
((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) &&
(a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) {

int widthn = a1->width;
/* Modulus bits of |m1| and |m2| are equal */
int mod_bits = BN_num_bits(m1);

if (!bn_wexpand(rr1, widthn))
goto err;
if (!bn_wexpand(rr2, widthn))
goto err;

/* Ensure that montgomery contexts are initialized */
if (in_mont1 == NULL) {
if ((mont1 = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont1, m1, ctx))
goto err;
in_mont1 = mont1;
}
if (in_mont2 == NULL) {
if ((mont2 = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont2, m2, ctx))
goto err;
in_mont2 = mont2;
}
BN_MONT_CTX *mont1 = NULL;
BN_MONT_CTX *mont2 = NULL;

if (ossl_rsaz_avx512ifma_eligible() &&
(((a1->width == 16) && (p1->width == 16) && (BN_num_bits(m1) == 1024) &&
(a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) ||
((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) &&
(a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) ||
((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) &&
(a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) {

int widthn = a1->width;
/* Modulus bits of |m1| and |m2| are equal */
int mod_bits = BN_num_bits(m1);

if (!bn_wexpand(rr1, widthn))
goto err;
if (!bn_wexpand(rr2, widthn))
goto err;

ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
in_mont1->RR.d, in_mont1->n0[0],
rr2->d, a2->d, p2->d, m2->d,
in_mont2->RR.d, in_mont2->n0[0],
mod_bits);
/* Ensure that montgomery contexts are initialized */
if (in_mont1 == NULL) {
if ((mont1 = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont1, m1, ctx))
goto err;
in_mont1 = mont1;
}
if (in_mont2 == NULL) {
if ((mont2 = BN_MONT_CTX_new()) == NULL)
goto err;
if (!BN_MONT_CTX_set(mont2, m2, ctx))
goto err;
in_mont2 = mont2;
}

rr1->width = widthn;
rr1->neg = 0;
bn_set_minimal_width(rr1);
ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
in_mont1->RR.d, in_mont1->n0[0],
rr2->d, a2->d, p2->d, m2->d,
in_mont2->RR.d, in_mont2->n0[0],
mod_bits);

rr2->width = widthn;
rr2->neg = 0;
bn_set_minimal_width(rr2);
rr1->width = widthn;
rr1->neg = 0;
bn_set_minimal_width(rr1);

goto err;

}
rr2->width = widthn;
rr2->neg = 0;
bn_set_minimal_width(rr2);

goto err;

}
#endif

/* rr1 = a1^p1 mod m1 */
ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
/* rr2 = a2^p2 mod m2 */
ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);
/* rr1 = a1^p1 mod m1 */
ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
/* rr2 = a2^p2 mod m2 */
ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);

#ifdef RSAZ_512_ENABLED
err:
if (mont2)
BN_MONT_CTX_free(mont2);
if (mont1)
BN_MONT_CTX_free(mont1);
if (mont2)
BN_MONT_CTX_free(mont2);
if (mont1)
BN_MONT_CTX_free(mont1);
#endif

return ret;
return ret;
}

int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
Expand Down
Loading

0 comments on commit 7d9a997

Please sign in to comment.