aws · nebeid · Sep 17, 2024 · Aug 7, 2023 · Oct 23, 2023 · Oct 23, 2023
diff --git a/.github/workflows/mingw.yml b/.github/workflows/mingw.yml
@@ -0,0 +1,42 @@
+name: MinGW
+on:
+  pull_request:
+    branches: [ '*' ]
+  push:
+    branches: [ '*' ]
+
+concurrency:  # one group per PR; push events carry no PR number, so they are keyed by ref instead
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}
+  cancel-in-progress: true
+jobs:
+  mingw:
+    if: github.repository == 'aws/aws-lc'
+    runs-on: windows-latest
+    steps:  # NOTE(review): the pinned tags of the three third-party actions below were restored from mangled text - confirm
+      - name: Install NASM
+        uses: ilammy/setup-nasm@v1.5.1
+      - name: Checkout
+        uses: actions/checkout@v4
+      - name: Setup MinGW
+        uses: egor-tensin/setup-mingw@v2.2.0
+        id: setup_mingw
+        with:
+          static: 0
+      - name: Setup CMake
+        uses: threeal/cmake-action@v1.3.0
+        with:
+          generator: Ninja
+          c-compiler: ${{ steps.setup_mingw.outputs.gcc }}
+          cxx-compiler: ${{ steps.setup_mingw.outputs.gxx }}
+          options: |
+            CMAKE_SYSTEM_NAME=Windows \
+            CMAKE_SYSTEM_PROCESSOR=x86_64 \
+            CMAKE_BUILD_TOOL=C:/ProgramData/chocolatey/lib/mingw/tools/install/mingw64/bin/ninja.exe \
+            CMAKE_FIND_ROOT_PATH=C:/ProgramData/chocolatey/lib/mingw/tools/install/mingw64 \
+            CMAKE_FIND_ROOT_PATH_MODE_PROGRAM=NEVER \
+            CMAKE_FIND_ROOT_PATH_MODE_LIBRARY=ONLY \
+            CMAKE_FIND_ROOT_PATH_MODE_INCLUDE=ONLY \
+      - name: Build Project
+        run: cmake --build ./build --target all
+      - name: Run tests
+        run: cmake --build ./build --target run_tests
@@ -393,7 +393,7 @@ if(FIPS_DELOCATE)
   # The flags are not required for any other compiler we are running in the CI.
   if (CLANG AND (CMAKE_ASM_COMPILER_ID MATCHES "Clang" OR CMAKE_ASM_COMPILER MATCHES "clang") AND
       (CMAKE_C_COMPILER_VERSION VERSION_LESS "7.0.0") AND (ARCH STREQUAL "x86_64"))
-    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/bcm-delocated.S PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq -mavx512vl")
+    set_source_files_properties(${CMAKE_CURRENT_BINARY_DIR}/bcm-delocated.S PROPERTIES COMPILE_FLAGS "-mavx512f -mavx512bw -mavx512dq -mavx512vl -mavx512ifma")
   endif()
 
   add_library(

@@ -75,54 +75,20 @@
 *STDOUT=*OUT;
 
 if ($avx512ifma>0) {{{
-@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
 
-$code.=<<___;
-.text
-.extern OPENSSL_ia32cap_P
-.globl  ossl_rsaz_avx512ifma_eligible
-.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
-.align  32
-ossl_rsaz_avx512ifma_eligible:
-    leaq OPENSSL_ia32cap_P(%rip),%r11
-    mov	 8(%r11),%r11d
-    xor %eax,%eax
-    and \$`1<<31|1<<21|1<<17|1<<16`, %r11d     # avx512vl + avx512ifma + avx512dq + avx512f
-    cmp \$`1<<31|1<<21|1<<17|1<<16`, %r11d
-    cmove %r11d,%eax
-    ret
-.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
-___
+# The "\@function,5" prologue emitted by the perlasm translator already moves
+# Win64 arguments into the System V registers, so one list serves both ABIs.
+@_6_args_universal_ABI = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9");
 
 ###############################################################################
-# Almost Montgomery Multiplication (AMM) for 20-digit number in radix 2^52.
-#
-# AMM is defined as presented in the paper [1].
-#
-# The input and output are presented in 2^52 radix domain, i.e.
-#   |res|, |a|, |b|, |m| are arrays of 20 64-bit qwords with 12 high bits zeroed.
-#   |k0| is a Montgomery coefficient, which is here k0 = -1/m mod 2^64
-#
-# NB: the AMM implementation does not perform "conditional" subtraction step
-# specified in the original algorithm as according to the Lemma 1 from the paper
-# [2], the result will be always < 2*m and can be used as a direct input to
-# the next AMM iteration.  This post-condition is true, provided the correct
-# parameter |s| (notion of the Lemma 1 from [2]) is chosen, i.e.  s >= n + 2 * k,
-# which matches our case: 1040 > 1024 + 2 * 1.
-#
-# [1] Gueron, S. Efficient software implementations of modular exponentiation.
-#     DOI: 10.1007/s13389-012-0031-5
-# [2] Gueron, S. Enhanced Montgomery Multiplication.
-#     DOI: 10.1007/3-540-36400-5_5
-#
-# void ossl_rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
+# void rsaz_amm52x20_x1_ifma256(BN_ULONG *res,
 #                                    const BN_ULONG *a,
 #                                    const BN_ULONG *b,
 #                                    const BN_ULONG *m,
 #                                    BN_ULONG k0);
 ###############################################################################
 {
-# input parameters ("%rdi","%rsi","%rdx","%rcx","%r8")
+# input parameters
 my ($res,$a,$b,$m,$k0) = @_6_args_universal_ABI;
 
 my $mask52     = "%rax";
@@ -325,10 +291,10 @@ sub amm52x20_x1_norm {
 $code.=<<___;
 .text
 
-.globl  ossl_rsaz_amm52x20_x1_ifma256
-.type   ossl_rsaz_amm52x20_x1_ifma256,\@function,5
+.globl  rsaz_amm52x20_x1_ifma256
+.type   rsaz_amm52x20_x1_ifma256,\@function,5
 .align 32
-ossl_rsaz_amm52x20_x1_ifma256:
+rsaz_amm52x20_x1_ifma256:
 .cfi_startproc
     endbranch
     push    %rbx
@@ -343,7 +309,7 @@ sub amm52x20_x1_norm {
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
-.Lossl_rsaz_amm52x20_x1_ifma256_body:
+.Lrsaz_amm52x20_x1_ifma256_body:
 
     # Zeroing accumulators
     vpxord   $zero, $zero, $zero
@@ -396,10 +362,10 @@ sub amm52x20_x1_norm {
 .cfi_restore    %rbx
     lea  48(%rsp),%rsp
 .cfi_adjust_cfa_offset  -48
-.Lossl_rsaz_amm52x20_x1_ifma256_epilogue:
+.Lrsaz_amm52x20_x1_ifma256_epilogue:
     ret
 .cfi_endproc
-.size   ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
+.size   rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
 ___
 
 $code.=<<___;
@@ -414,27 +380,20 @@ sub amm52x20_x1_norm {
 ___
 
 ###############################################################################
-# Dual Almost Montgomery Multiplication for 20-digit number in radix 2^52
-#
-# See description of ossl_rsaz_amm52x20_x1_ifma256() above for details about Almost
-# Montgomery Multiplication algorithm and function input parameters description.
-#
-# This function does two AMMs for two independent inputs, hence dual.
-#
-# void ossl_rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
-#                                    const BN_ULONG a[2][20],
-#                                    const BN_ULONG b[2][20],
-#                                    const BN_ULONG m[2][20],
-#                                    const BN_ULONG k0[2]);
+# void rsaz_amm52x20_x2_ifma256(BN_ULONG out[2][20],
+#                               const BN_ULONG a[2][20],
+#                               const BN_ULONG b[2][20],
+#                               const BN_ULONG m[2][20],
+#                               const BN_ULONG k0[2]);
 ###############################################################################
 
 $code.=<<___;
 .text
 
-.globl  ossl_rsaz_amm52x20_x2_ifma256
-.type   ossl_rsaz_amm52x20_x2_ifma256,\@function,5
+.globl  rsaz_amm52x20_x2_ifma256
+.type   rsaz_amm52x20_x2_ifma256,\@function,5
 .align 32
-ossl_rsaz_amm52x20_x2_ifma256:
+rsaz_amm52x20_x2_ifma256:
 .cfi_startproc
     endbranch
     push    %rbx
@@ -449,7 +408,7 @@ sub amm52x20_x1_norm {
 .cfi_push   %r14
     push    %r15
 .cfi_push   %r15
-.Lossl_rsaz_amm52x20_x2_ifma256_body:
+.Lrsaz_amm52x20_x2_ifma256_body:
 
     # Zeroing accumulators
     vpxord   $zero, $zero, $zero
@@ -514,27 +473,18 @@ sub amm52x20_x1_norm {
 .cfi_restore    %rbx
     lea  48(%rsp),%rsp
 .cfi_adjust_cfa_offset  -48
-.Lossl_rsaz_amm52x20_x2_ifma256_epilogue:
+.Lrsaz_amm52x20_x2_ifma256_epilogue:
     ret
 .cfi_endproc
-.size   ossl_rsaz_amm52x20_x2_ifma256, .-ossl_rsaz_amm52x20_x2_ifma256
+.size   rsaz_amm52x20_x2_ifma256, .-rsaz_amm52x20_x2_ifma256
 ___
 }
 
 ###############################################################################
-# Constant time extraction from the precomputed table of powers base^i, where
-#    i = 0..2^EXP_WIN_SIZE-1
-#
-# The input |red_table| contains precomputations for two independent base values.
-# |red_table_idx1| and |red_table_idx2| are corresponding power indexes.
-#
-# Extracted value (output) is 2 20 digit numbers in 2^52 radix.
-#
-# void ossl_extract_multiplier_2x20_win5(BN_ULONG *red_Y,
+# void extract_multiplier_2x20_win5(BN_ULONG *red_Y,
 #                                        const BN_ULONG red_table[1 << EXP_WIN_SIZE][2][20],
 #                                        int red_table_idx1, int red_table_idx2);
 #
-# EXP_WIN_SIZE = 5
 ###############################################################################
 {
 # input parameters
@@ -553,9 +503,9 @@ sub amm52x20_x1_norm {
 .text
 
 .align 32
-.globl  ossl_extract_multiplier_2x20_win5
-.type   ossl_extract_multiplier_2x20_win5,\@abi-omnipotent
-ossl_extract_multiplier_2x20_win5:
+.globl  extract_multiplier_2x20_win5
+.type   extract_multiplier_2x20_win5,\@abi-omnipotent
+extract_multiplier_2x20_win5:
 .cfi_startproc
     endbranch
     vmovdqa64   .Lones(%rip), $ones         # broadcast ones
@@ -597,7 +547,7 @@ sub amm52x20_x1_norm {
 $code.=<<___;
     ret
 .cfi_endproc
-.size   ossl_extract_multiplier_2x20_win5, .-ossl_extract_multiplier_2x20_win5
+.size   extract_multiplier_2x20_win5, .-extract_multiplier_2x20_win5
 ___
 $code.=<<___;
 .section .rodata
@@ -707,47 +657,39 @@ sub amm52x20_x1_norm {
 
 .section    .pdata
 .align  4
-    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x1_ifma256
-    .rva    .LSEH_end_ossl_rsaz_amm52x20_x1_ifma256
-    .rva    .LSEH_info_ossl_rsaz_amm52x20_x1_ifma256
+    .rva    .LSEH_begin_rsaz_amm52x20_x1_ifma256
+    .rva    .LSEH_end_rsaz_amm52x20_x1_ifma256
+    .rva    .LSEH_info_rsaz_amm52x20_x1_ifma256
 
-    .rva    .LSEH_begin_ossl_rsaz_amm52x20_x2_ifma256
-    .rva    .LSEH_end_ossl_rsaz_amm52x20_x2_ifma256
-    .rva    .LSEH_info_ossl_rsaz_amm52x20_x2_ifma256
+    .rva    .LSEH_begin_rsaz_amm52x20_x2_ifma256
+    .rva    .LSEH_end_rsaz_amm52x20_x2_ifma256
+    .rva    .LSEH_info_rsaz_amm52x20_x2_ifma256
 
 .section    .xdata
 .align  8
-.LSEH_info_ossl_rsaz_amm52x20_x1_ifma256:
+.LSEH_info_rsaz_amm52x20_x1_ifma256:
     .byte   9,0,0,0
     .rva    rsaz_def_handler
-    .rva    .Lossl_rsaz_amm52x20_x1_ifma256_body,.Lossl_rsaz_amm52x20_x1_ifma256_epilogue
-.LSEH_info_ossl_rsaz_amm52x20_x2_ifma256:
+    .rva    .Lrsaz_amm52x20_x1_ifma256_body,.Lrsaz_amm52x20_x1_ifma256_epilogue
+.LSEH_info_rsaz_amm52x20_x2_ifma256:
     .byte   9,0,0,0
     .rva    rsaz_def_handler
-    .rva    .Lossl_rsaz_amm52x20_x2_ifma256_body,.Lossl_rsaz_amm52x20_x2_ifma256_epilogue
+    .rva    .Lrsaz_amm52x20_x2_ifma256_body,.Lrsaz_amm52x20_x2_ifma256_epilogue
 ___
 }
 }}} else {{{                # fallback for old assembler
 $code.=<<___;
 .text
-
-.globl  ossl_rsaz_avx512ifma_eligible
-.type   ossl_rsaz_avx512ifma_eligible,\@abi-omnipotent
-ossl_rsaz_avx512ifma_eligible:
-    xor     %eax,%eax
-    ret
-.size   ossl_rsaz_avx512ifma_eligible, .-ossl_rsaz_avx512ifma_eligible
-
-.globl  ossl_rsaz_amm52x20_x1_ifma256
-.globl  ossl_rsaz_amm52x20_x2_ifma256
-.globl  ossl_extract_multiplier_2x20_win5
-.type   ossl_rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
-ossl_rsaz_amm52x20_x1_ifma256:
-ossl_rsaz_amm52x20_x2_ifma256:
-ossl_extract_multiplier_2x20_win5:
+.globl  rsaz_amm52x20_x1_ifma256
+.globl  rsaz_amm52x20_x2_ifma256
+.globl  extract_multiplier_2x20_win5
+.type   rsaz_amm52x20_x1_ifma256,\@abi-omnipotent
+rsaz_amm52x20_x1_ifma256:
+rsaz_amm52x20_x2_ifma256:
+extract_multiplier_2x20_win5:
     .byte   0x0f,0x0b    # ud2
     ret
-.size   ossl_rsaz_amm52x20_x1_ifma256, .-ossl_rsaz_amm52x20_x1_ifma256
+.size   rsaz_amm52x20_x1_ifma256, .-rsaz_amm52x20_x1_ifma256
 ___
 }}}