From 7d9a99759d60574d87476b3acb26d5b953e63b84 Mon Sep 17 00:00:00 2001
From: dan pittman <dan@dpitt.me>
Date: Mon, 29 Jan 2024 16:00:26 -0800
Subject: [PATCH] address review comments

---
 crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl |  43 +----
 crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl |  48 +----
 crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl |  42 +----
 crypto/fipsmodule/bn/exponentiation.c      | 118 ++++++-------
 crypto/fipsmodule/bn/internal.h            | 196 +++++++++++++++++++++
 crypto/fipsmodule/bn/rsaz_exp_x2.c         |  77 +-------
 6 files changed, 271 insertions(+), 253 deletions(-)

diff --git a/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl
index 347de51cf29..950b05e86ac 100644
--- a/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl
+++ b/crypto/fipsmodule/bn/asm/rsaz-2k-avx512.pl
@@ -75,7 +75,10 @@
 *STDOUT=*OUT;
 
 if ($avx512ifma>0) {{{
-@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+
+# Keep the SysV names: Win64 passes only 4 integer args in registers (the 5th
+# is on the stack). NOTE(review): assumes perlasm remaps Win64 args - confirm.
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
 
 $code.=<<___;
 .text
@@ -95,26 +98,6 @@
 ___
 
 ###############################################################################
-# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
-#
-# AMM is defined as presented in the paper [1].
-#
-# The input and output are presented in 2^52 radix domain, i.e.
-#   |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
-#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-#
-# NB: the AMM implementation does not perform "conditional" subtraction step
-# specified in the original algorithm as according to the Lemma 1 from the paper
-# [2], the result will be always < 2*m and can be used as a direct input to
-# the next AMM iteration.  This post-condition is true, provided the correct
-# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
-# which matches our case: 1040 > 1024 + 2 * 1.
-#
-# [1] Gueron, S. Efficient software implementations of modular exponentiation.
-#     DOI: 10.1007/s13389-012-0031-5
-# [2] Gueron, S. Enhanced Montgomery Multiplication.
-#     DOI: 10.1007/3-540-36400-5_5
-#
 # void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
 #                                    const BN_ULONG *a,
 #                                    const BN_ULONG *b,
@@ -122,7 +105,7 @@
 #                                    BN_ULONG k0);
 ###############################################################################
 {
-# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+# input parameters
 my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
 
 my $mask52     = "%rax";
@@ -414,13 +397,6 @@ sub amm52x20_x1_norm {
 ___
 
 ###############################################################################
-# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
-#
-# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost
-# Montgomery Multiplication algorithm and function input parameters description.
-#
-# This function does two AMMs for two independent inputs, hence dual.
-#
 # void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
 #                                    const BN_ULONG a[2][20],
 #                                    const BN_ULONG b[2][20],
@@ -522,19 +498,10 @@ sub amm52x20_x1_norm {
 }
 
 ###############################################################################
-# Constant time extraction from the precomputed table of powers base^i, where
-#    i = 0..2^EXP_WIN_SIZE-1
-#
-# The input |red_table| contains precomputations for two independent base values.
-# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
-#
-# Extracted value (output) is 2 20 digit numbers in 2^52 radix.
-#
 # void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
 #                                        int red_table_idx1, int red_table_idx2);
 #
-# EXP_WIN_SIZE = 5
 ###############################################################################
 {
 # input parameters
diff --git a/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl
index 600f5a3f269..b15b35e75ec 100644
--- a/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl
+++ b/crypto/fipsmodule/bn/asm/rsaz-3k-avx512.pl
@@ -74,32 +74,11 @@
 *STDOUT=*OUT;
 
 if ($avx512ifma>0) {{{
-@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# Keep the SysV names: Win64 passes only 4 integer args in registers (the 5th
+# is on the stack). NOTE(review): assumes perlasm remaps Win64 args - confirm.
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
 
 ###############################################################################
-# Almost Montgomery Multiplication (AMM) for 30-digit number in radix 2^52.
-#
-# AMM is defined as presented in the paper [1].
-#
-# The input and output are presented in 2^52 radix domain, i.e.
-#   |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high bits zeroed
-#
-#   NOTE: the function uses zero-padded data - 2 high QWs is a padding.
-#
-#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-#
-# NB: the AMM implementation does not perform "conditional" subtraction step
-# specified in the original algorithm as according to the Lemma 1 from the paper
-# [2], the result will be always < 2*m and can be used as a direct input to
-# the next AMM iteration.  This post-condition is true, provided the correct
-# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
-# which matches our case: 1560 > 1536 + 2 * 1.
-#
-# [1] Gueron, S. Efficient software implementations of modular exponentiation.
-#     DOI: 10.1007/s13389-012-0031-5
-# [2] Gueron, S. Enhanced Montgomery Multiplication.
-#     DOI: 10.1007/3-540-36400-5_5
-#
 # void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res,
 #                                    const BN_ULONG *a,
 #                                    const BN_ULONG *b,
@@ -107,7 +86,7 @@
 #                                    BN_ULONG k0);
 ###############################################################################
 {
-# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+# input parameters
 my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
 
 my $mask52     = "%rax";
@@ -504,15 +483,6 @@ sub amm52x30_x1_norm {
 ___
 
 ###############################################################################
-# Dual Almost Montgomery Multiplication for 30-digit number in radix 2^52
-#
-# See description of ossl_rsaz_amm52x30_x1_ifma256() above for details about Almost
-# Montgomery Multiplication algorithm and function input parameters description.
-#
-# This function does two AMMs for two independent inputs, hence dual.
-#
-# NOTE: the function uses zero-padded data - 2 high QWs is a padding.
-#
 # void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG out[2][32],
 #                                    const BN_ULONG a[2][32],
 #                                    const BN_ULONG b[2][32],
@@ -659,20 +629,10 @@ sub amm52x30_x1_norm {
 }
 
 ###############################################################################
-# Constant time extraction from the precomputed table of powers base^i, where
-#    i = 0..2^EXP_WIN_SIZE-1
-#
-# The input |red_table| contains precomputations for two independent base values.
-# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
-#
-# Extracted value (output) is 2 (30 + 2) digits numbers in 2^52 radix.
-# (2 high QW is zero padding)
-#
 # void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][32],
 #                                        int red_table_idx1, int red_table_idx2);
 #
-# EXP_WIN_SIZE = 5
 ###############################################################################
 {
 # input parameters
diff --git a/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl b/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl
index 9edd148658d..fbdc4d7c050 100644
--- a/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl
+++ b/crypto/fipsmodule/bn/asm/rsaz-4k-avx512.pl
@@ -74,29 +74,11 @@
 *STDOUT=*OUT;
 
 if ($avx512ifma>0) {{{
-@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
+# Keep the SysV names: Win64 passes only 4 integer args in registers (the 5th
+# is on the stack). NOTE(review): assumes perlasm remaps Win64 args - confirm.
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
 
 ###############################################################################
-# Almost Montgomery Multiplication (AMM) for 40-digit number in radix 2^52.
-#
-# AMM is defined as presented in the paper [1].
-#
-# The input and output are presented in 2^52 radix domain, i.e.
-#   |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high bits zeroed.
-#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-#
-# NB: the AMM implementation does not perform "conditional" subtraction step
-# specified in the original algorithm as according to the Lemma 1 from the paper
-# [2], the result will be always < 2*m and can be used as a direct input to
-# the next AMM iteration.  This post-condition is true, provided the correct
-# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
-# which matches our case: 2080 > 2048 + 2 * 1.
-#
-# [1] Gueron, S. Efficient software implementations of modular exponentiation.
-#     DOI: 10.1007/s13389-012-0031-5
-# [2] Gueron, S. Enhanced Montgomery Multiplication.
-#     DOI: 10.1007/3-540-36400-5_5
-#
 # void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res,
 #                                    const BN_ULONG *a,
 #                                    const BN_ULONG *b,
@@ -104,7 +86,7 @@
 #                                    BN_ULONG k0);
 ###############################################################################
 {
-# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+# input parameters
 my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
 
 my $mask52     = "%rax";
@@ -545,13 +527,6 @@ sub amm52x40_x1_norm {
 ___
 
 ###############################################################################
-# Dual Almost Montgomery Multiplication for 40-digit number in radix 2^52
-#
-# See description of ossl_rsaz_amm52x40_x1_ifma256() above for details about Almost
-# Montgomery Multiplication algorithm and function input parameters description.
-#
-# This function does two AMMs for two independent inputs, hence dual.
-#
 # void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG out[2][40],
 #                                    const BN_ULONG a[2][40],
 #                                    const BN_ULONG b[2][40],
@@ -706,19 +681,10 @@ sub amm52x40_x1_norm {
 }
 
 ###############################################################################
-# Constant time extraction from the precomputed table of powers base^i, where
-#    i = 0..2^EXP_WIN_SIZE-1
-#
-# The input |red_table| contains precomputations for two independent base values.
-# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
-#
-# Extracted value (output) is 2 40 digits numbers in 2^52 radix.
-#
 # void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][40],
 #                                        int red_table_idx1, int red_table_idx2);
 #
-# EXP_WIN_SIZE = 5
 ###############################################################################
 {
 # input parameters
diff --git a/crypto/fipsmodule/bn/exponentiation.c b/crypto/fipsmodule/bn/exponentiation.c
index 2d133a900e9..31d1fd3e3d3 100644
--- a/crypto/fipsmodule/bn/exponentiation.c
+++ b/crypto/fipsmodule/bn/exponentiation.c
@@ -1266,78 +1266,78 @@ int BN_mod_exp_mont_consttime_x2(BIGNUM *rr1, const BIGNUM *a1, const BIGNUM *p1
                                  const BIGNUM *m2, const BN_MONT_CTX *in_mont2,
                                  BN_CTX *ctx)
 {
-    int ret = 0;
+  int ret = 0;
 
 #ifdef RSAZ_512_ENABLED
-    BN_MONT_CTX *mont1 = NULL;
-    BN_MONT_CTX *mont2 = NULL;
-
-    if (ossl_rsaz_avx512ifma_eligible() &&
-        (((a1->width == 16) && (p1->width == 16) && (BN_num_bits(m1) == 1024) &&
-          (a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) ||
-         ((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) &&
-          (a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) ||
-         ((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) &&
-          (a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) {
-
-        int widthn = a1->width;
-        /* Modulus bits of |m1| and |m2| are equal */
-        int mod_bits = BN_num_bits(m1);
-
-        if (!bn_wexpand(rr1, widthn))
-            goto err;
-        if (!bn_wexpand(rr2, widthn))
-            goto err;
-
-        /*  Ensure that montgomery contexts are initialized */
-        if (in_mont1 == NULL) {
-          if ((mont1 = BN_MONT_CTX_new()) == NULL)
-              goto err;
-          if (!BN_MONT_CTX_set(mont1, m1, ctx))
-              goto err;
-          in_mont1 = mont1;
-        }
-        if (in_mont2 == NULL) {
-          if ((mont2 = BN_MONT_CTX_new()) == NULL)
-              goto err;
-          if (!BN_MONT_CTX_set(mont2, m2, ctx))
-              goto err;
-          in_mont2 = mont2;
-        }
+  BN_MONT_CTX *mont1 = NULL;
+  BN_MONT_CTX *mont2 = NULL;
+
+  if (ossl_rsaz_avx512ifma_eligible() &&
+    (((a1->width == 16) && (p1->width == 16) && (BN_num_bits(m1) == 1024) &&
+      (a2->width == 16) && (p2->width == 16) && (BN_num_bits(m2) == 1024)) ||
+     ((a1->width == 24) && (p1->width == 24) && (BN_num_bits(m1) == 1536) &&
+      (a2->width == 24) && (p2->width == 24) && (BN_num_bits(m2) == 1536)) ||
+     ((a1->width == 32) && (p1->width == 32) && (BN_num_bits(m1) == 2048) &&
+      (a2->width == 32) && (p2->width == 32) && (BN_num_bits(m2) == 2048)))) {
+
+    int widthn = a1->width;
+    /* Modulus bits of |m1| and |m2| are equal */
+    int mod_bits = BN_num_bits(m1);
+
+    if (!bn_wexpand(rr1, widthn))
+        goto err;
+    if (!bn_wexpand(rr2, widthn))
+        goto err;
 
-        ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
-                                          in_mont1->RR.d, in_mont1->n0[0],
-                                          rr2->d, a2->d, p2->d, m2->d,
-                                          in_mont2->RR.d, in_mont2->n0[0],
-                                          mod_bits);
+    /*  Ensure that montgomery contexts are initialized */
+    if (in_mont1 == NULL) {
+      if ((mont1 = BN_MONT_CTX_new()) == NULL)
+        goto err;
+      if (!BN_MONT_CTX_set(mont1, m1, ctx))
+        goto err;
+      in_mont1 = mont1;
+    }
+    if (in_mont2 == NULL) {
+      if ((mont2 = BN_MONT_CTX_new()) == NULL)
+        goto err;
+      if (!BN_MONT_CTX_set(mont2, m2, ctx))
+        goto err;
+      in_mont2 = mont2;
+    }
 
-        rr1->width = widthn;
-        rr1->neg = 0;
-        bn_set_minimal_width(rr1);
+    ret = ossl_rsaz_mod_exp_avx512_x2(rr1->d, a1->d, p1->d, m1->d,
+                                      in_mont1->RR.d, in_mont1->n0[0],
+                                      rr2->d, a2->d, p2->d, m2->d,
+                                      in_mont2->RR.d, in_mont2->n0[0],
+                                      mod_bits);
 
-        rr2->width = widthn;
-        rr2->neg = 0;
-        bn_set_minimal_width(rr2);
+    rr1->width = widthn;
+    rr1->neg = 0;
+    bn_set_minimal_width(rr1);
 
-	goto err;
-	
-    }
+    rr2->width = widthn;
+    rr2->neg = 0;
+    bn_set_minimal_width(rr2);
+
+    goto err;
+
+  }
 #endif
 
-    /* rr1 = a1^p1 mod m1 */
-    ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
-    /* rr2 = a2^p2 mod m2 */
-    ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);
+  /* rr1 = a1^p1 mod m1 */
+  ret = BN_mod_exp_mont_consttime(rr1, a1, p1, m1, ctx, in_mont1);
+  /* rr2 = a2^p2 mod m2 */
+  ret &= BN_mod_exp_mont_consttime(rr2, a2, p2, m2, ctx, in_mont2);
 
 #ifdef RSAZ_512_ENABLED
 err:
-    if (mont2)
-        BN_MONT_CTX_free(mont2);
-    if (mont1)
-        BN_MONT_CTX_free(mont1);
+  if (mont2)
+    BN_MONT_CTX_free(mont2);
+  if (mont1)
+    BN_MONT_CTX_free(mont1);
 #endif
 
-    return ret;
+  return ret;
 }
 
 int BN_mod_exp_mont_word(BIGNUM *rr, BN_ULONG a, const BIGNUM *p,
diff --git a/crypto/fipsmodule/bn/internal.h b/crypto/fipsmodule/bn/internal.h
index bf82d32e9ae..1cb0f848242 100644
--- a/crypto/fipsmodule/bn/internal.h
+++ b/crypto/fipsmodule/bn/internal.h
@@ -790,6 +790,202 @@ void bn_little_endian_to_words(BN_ULONG *out, size_t out_len, const uint8_t *in,
 // leading zeros.
 void bn_words_to_little_endian(uint8_t *out, size_t out_len, const BN_ULONG *in, const size_t in_len);
 
+
+// Naming convention for the following functions:
+//
+//   * amm: Almost Montgomery Multiplication
+//   * ams: Almost Montgomery Squaring
+//   * 52xZZ: data represented as array of ZZ digits in 52-bit radix
+//   * _x1_/_x2_:  1 or 2 independent inputs/outputs
+//   * ifma256: uses 256-bit wide IFMA ISA (AVX512_IFMA256)
+//
+//
+// Almost Montgomery Multiplication (AMM) for 20-digit number in radix
+// 2^52.
+//
+// AMM is defined as presented in the paper [1].
+//
+// The input and output are presented in 2^52 radix domain, i.e.
+// |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high
+// bits zeroed.  |k0| is a Montgomery coefficient, which is here k0 =
+// -1/m mod 2^64
+//
+// NB: the AMM implementation does not perform the "conditional"
+// subtraction step specified in the original algorithm: according
+// to Lemma 1 of paper [2], the result is always < 2*m and can be
+// used directly as input to the next AMM iteration.  This
+// post-condition holds provided the parameter |s| (in the notation
+// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
+// matches our case: 1040 > 1024 + 2 * 1.
+//
+// [1] Gueron, S. Efficient software implementations of modular
+//     exponentiation.  DOI: 10.1007/s13389-012-0031-5
+// [2] Gueron, S. Enhanced Montgomery Multiplication.  DOI:
+//     10.1007/3-540-36400-5_5
+void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   BN_ULONG k0);
+
+// Dual Almost Montgomery Multiplication for 20-digit number in radix
+// 2^52
+//
+// See description of ossl_rsaz_amm52x20_x1_ifma256() above for
+// details about Almost Montgomery Multiplication algorithm and
+// function input parameters description.
+//
+// This function does two AMMs for two independent inputs, hence dual.
+void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   const BN_ULONG k0[2]);
+
+// Constant time extraction from the precomputed table of powers
+// base^i, where i = 0..2^EXP_WIN_SIZE-1
+//
+// The input |red_table| contains precomputations for two independent
+// base values.  |red_table_idx1| and |red_table_idx2| are
+// corresponding power indexes.
+//
+// Extracted value (output) is two 20-digit numbers in 2^52 radix.
+//
+// EXP_WIN_SIZE = 5
+void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
+                                       const BN_ULONG *red_table,
+                                       int red_table_idx1, int red_table_idx2);
+
+// Almost Montgomery Multiplication (AMM) for 30-digit number in radix
+// 2^52.
+//
+// AMM is defined as presented in the paper [1].
+//
+// The input and output are presented in 2^52 radix domain, i.e.
+// |res|, |a|, |b|, |m| are arrays of 32 64-bit qwords with 12 high
+// bits zeroed.
+//
+// NOTE: the function uses zero-padded data - the 2 high QWs are padding.
+//
+// |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
+//
+// NB: the AMM implementation does not perform the "conditional"
+// subtraction step specified in the original algorithm: according
+// to Lemma 1 of paper [2], the result is always < 2*m and can be
+// used directly as input to the next AMM iteration.  This
+// post-condition holds provided the parameter |s| (in the notation
+// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
+// matches our case: 1560 > 1536 + 2 * 1.
+//
+// [1] Gueron, S. Efficient software implementations of modular
+//     exponentiation.  DOI: 10.1007/s13389-012-0031-5
+// [2] Gueron, S. Enhanced Montgomery Multiplication.  DOI:
+//     10.1007/3-540-36400-5_5
+void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   BN_ULONG k0);
+// Dual Almost Montgomery Multiplication for 30-digit number in radix
+// 2^52
+//
+// See description of ossl_rsaz_amm52x30_x1_ifma256() above for
+// details about Almost Montgomery Multiplication algorithm and
+// function input parameters description.
+//
+// This function does two AMMs for two independent inputs, hence dual.
+//
+// NOTE: the function uses zero-padded data - the 2 high QWs are padding.
+void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   const BN_ULONG k0[2]);
+
+// Constant time extraction from the precomputed table of powers
+// base^i, where i = 0..2^EXP_WIN_SIZE-1
+//
+// The input |red_table| contains precomputations for two independent
+// base values.  |red_table_idx1| and |red_table_idx2| are
+// corresponding power indexes.
+//
+// Extracted value (output) is two (30 + 2)-digit numbers in 2^52
+// radix (the 2 high QWs are zero padding).
+//
+// EXP_WIN_SIZE = 5
+void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
+                                       const BN_ULONG *red_table,
+                                       int red_table_idx1, int red_table_idx2);
+
+// Almost Montgomery Multiplication (AMM) for 40-digit number in radix
+// 2^52.
+//
+// AMM is defined as presented in the paper [1].
+//
+// The input and output are presented in 2^52 radix domain, i.e.
+// |res|, |a|, |b|, |m| are arrays of 40 64-bit qwords with 12 high
+// bits zeroed.  |k0| is a Montgomery coefficient, which is here k0 =
+// -1/m mod 2^64
+//
+// NB: the AMM implementation does not perform the "conditional"
+// subtraction step specified in the original algorithm: according
+// to Lemma 1 of paper [2], the result is always < 2*m and can be
+// used directly as input to the next AMM iteration.  This
+// post-condition holds provided the parameter |s| (in the notation
+// of Lemma 1 of [2]) is chosen correctly, i.e. s >= n + 2 * k, which
+// matches our case: 2080 > 2048 + 2 * 1.
+//
+// [1] Gueron, S. Efficient software implementations of modular
+//     exponentiation.  DOI: 10.1007/s13389-012-0031-5
+// [2] Gueron, S. Enhanced Montgomery Multiplication.  DOI:
+//     10.1007/3-540-36400-5_5
+void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   BN_ULONG k0);
+
+// Dual Almost Montgomery Multiplication for 40-digit number in radix
+// 2^52
+//
+// See description of ossl_rsaz_amm52x40_x1_ifma256() above for
+// details about Almost Montgomery Multiplication algorithm and
+// function input parameters description.
+//
+// This function does two AMMs for two independent inputs, hence dual.
+void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
+                                   const BN_ULONG *b, const BN_ULONG *m,
+                                   const BN_ULONG k0[2]);
+
+// Constant time extraction from the precomputed table of powers base^i, where
+//    i = 0..2^EXP_WIN_SIZE-1
+//
+// The input |red_table| contains precomputations for two independent base values.
+// |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
+//
+// Extracted value (output) is two 40-digit numbers in 2^52 radix.
+//
+// EXP_WIN_SIZE = 5
+void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
+                                       const BN_ULONG *red_table,
+                                       int red_table_idx1, int red_table_idx2);
+
+// Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of
+// the same bit size using Almost Montgomery Multiplication, optimized with
+// AVX512_IFMA256 ISA.  The window size w is 5.
+//
+//  [out] res      - result of modular exponentiation: 2x{20,30,40} qword
+//                   values in 2^52 radix.
+//  [in]  base     - base (2x{20,30,40} qword values in 2^52 radix)
+//  [in]  exp      - array of 2 pointers to {16,24,32} qword values in 2^64 radix.
+//                   Exponent is not converted to redundant representation.
+//  [in]  m        - moduli (2x{20,30,40} qword values in 2^52 radix)
+//  [in]  rr       - Montgomery parameter for 2 moduli: RR(1024) = 2^2080 mod m,
+//                   RR(1536) = 2^3120 mod m, RR(2048) = 2^4160 mod m
+//                   (2x{20,30,40} qword values in 2^52 radix)
+//  [in]  k0       - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64
+//  [in]  modulus_bitsize - presumably one of 1024, 1536, 2048 (TODO confirm)
+//
+// Returns an int status (presumably 1 on success, 0 on error - TODO confirm).
+//
+// Not |static|: a |static| prototype in a shared header would leave an
+// undefined internal-linkage function in every other file that includes it.
+int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base,
+                            const BN_ULONG *exp[2], const BN_ULONG *m,
+                            const BN_ULONG *rr, const BN_ULONG k0[2],
+                            int modulus_bitsize);
+
+
 #if defined(__cplusplus)
 }  // extern C
 #endif
diff --git a/crypto/fipsmodule/bn/rsaz_exp_x2.c b/crypto/fipsmodule/bn/rsaz_exp_x2.c
index be0467c6bd7..c07fe93388a 100644
--- a/crypto/fipsmodule/bn/rsaz_exp_x2.c
+++ b/crypto/fipsmodule/bn/rsaz_exp_x2.c
@@ -32,11 +32,9 @@ static void *__dummy = &__dummy;
 #include <openssl/crypto.h>
 #include <assert.h>
 #include <string.h>
+#include "../../internal.h"
 #include "rsaz_exp.h"
 
-# define ALIGN_OF(ptr, boundary) \
-    ((unsigned char *)(ptr) + (boundary - (((size_t)(ptr)) & (boundary - 1))))
-
 /* Internal radix */
 # define DIGIT_SIZE (52)
 /* 52-bit mask */
@@ -62,53 +60,6 @@ OPENSSL_INLINE int number_of_digits(int bitsize, int digit_size)
     return (bitsize + digit_size - 1) / digit_size;
 }
 
-/*
- * For details of the methods declared below please refer to
- *    crypto/bn/asm/rsaz-avx512.pl
- *
- * Naming conventions:
- *  amm = Almost Montgomery Multiplication
- *  ams = Almost Montgomery Squaring
- *  52xZZ - data represented as array of ZZ digits in 52-bit radix
- *  _x1_/_x2_ - 1 or 2 independent inputs/outputs
- *  _ifma256 - uses 256-bit wide IFMA ISA (AVX512_IFMA256)
- */
-
-void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   BN_ULONG k0);
-void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   const BN_ULONG k0[2]);
-void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
-                                       const BN_ULONG *red_table,
-                                       int red_table_idx1, int red_table_idx2);
-
-void ossl_rsaz_amm52x30_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   BN_ULONG k0);
-void ossl_rsaz_amm52x30_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   const BN_ULONG k0[2]);
-void ossl_extract_multiplier_2x30_win5(BN_ULONG *red_Y,
-                                       const BN_ULONG *red_table,
-                                       int red_table_idx1, int red_table_idx2);
-
-void ossl_rsaz_amm52x40_x1_ifma256(BN_ULONG *res, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   BN_ULONG k0);
-void ossl_rsaz_amm52x40_x2_ifma256(BN_ULONG *out, const BN_ULONG *a,
-                                   const BN_ULONG *b, const BN_ULONG *m,
-                                   const BN_ULONG k0[2]);
-void ossl_extract_multiplier_2x40_win5(BN_ULONG *red_Y,
-                                       const BN_ULONG *red_table,
-                                       int red_table_idx1, int red_table_idx2);
-
-static int RSAZ_mod_exp_x2_ifma256(BN_ULONG *res, const BN_ULONG *base,
-                                   const BN_ULONG *exp[2], const BN_ULONG *m,
-                                   const BN_ULONG *rr, const BN_ULONG k0[2],
-                                   int modulus_bitsize);
-
 /*
  * Dual Montgomery modular exponentiation using prime moduli of the
  * same bit size, optimized with AVX512 ISA.
@@ -198,7 +149,7 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1,
     storage = (BN_ULONG *)OPENSSL_malloc(storage_len_bytes);
     if (storage == NULL)
         goto err;
-    storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64);
+    storage_aligned = (BN_ULONG *)align_pointer(storage, 64);
 
     /* Memory layout for red(undant) representations */
     base1_red = storage_aligned;
@@ -273,28 +224,6 @@ int ossl_rsaz_mod_exp_avx512_x2(BN_ULONG *res1,
     return ret;
 }
 
-/*
- * Dual {1024,1536,2048}-bit w-ary modular exponentiation using prime moduli of
- * the same bit size using Almost Montgomery Multiplication, optimized with
- * AVX512_IFMA256 ISA.
- *
- * The parameter w (window size) = 5.
- *
- *  [out] res      - result of modular exponentiation: 2x{20,30,40} qword
- *                   values in 2^52 radix.
- *  [in]  base     - base (2x{20,30,40} qword values in 2^52 radix)
- *  [in]  exp      - array of 2 pointers to {16,24,32} qword values in 2^64 radix.
- *                   Exponent is not converted to redundant representation.
- *  [in]  m        - moduli (2x{20,30,40} qword values in 2^52 radix)
- *  [in]  rr       - Montgomery parameter for 2 moduli:
- *                     RR(1024) = 2^2080 mod m.
- *                     RR(1536) = 2^3120 mod m.
- *                     RR(2048) = 2^4160 mod m.
- *                   (2x{20,30,40} qword values in 2^52 radix)
- *  [in]  k0       - Montgomery parameter for 2 moduli: k0 = -1/m mod 2^64
- *
- * \return (void).
- */
 int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out,
                             const BN_ULONG *base,
                             const BN_ULONG *exp[2],
@@ -381,7 +310,7 @@ int RSAZ_mod_exp_x2_ifma256(BN_ULONG *out,
     if (storage == NULL)
         goto err;
     OPENSSL_cleanse(storage, storage_len_bytes);
-    storage_aligned = (BN_ULONG *)ALIGN_OF(storage, 64);
+    storage_aligned = (BN_ULONG *)align_pointer(storage, 64);
 
     red_Y     = storage_aligned;
     red_X     = red_Y + 2 * red_digits;