diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake index 73e972dd70..7309c800f3 100644 --- a/.CMake/alg_support.cmake +++ b/.CMake/alg_support.cmake @@ -166,6 +166,12 @@ cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192f_simple "" ON "OQS_ENABL cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF) cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256f_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF) cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF) + +option(OQS_ENABLE_SIG_MAYO "Enable mayo algorithm family" ON) +cmake_dependent_option(OQS_ENABLE_SIG_mayo_1 "" ON "OQS_ENABLE_SIG_MAYO" OFF) +cmake_dependent_option(OQS_ENABLE_SIG_mayo_2 "" ON "OQS_ENABLE_SIG_MAYO" OFF) +cmake_dependent_option(OQS_ENABLE_SIG_mayo_3 "" ON "OQS_ENABLE_SIG_MAYO" OFF) +cmake_dependent_option(OQS_ENABLE_SIG_mayo_5 "" ON "OQS_ENABLE_SIG_MAYO" OFF) ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_END if((OQS_MINIMAL_BUILD STREQUAL "ON")) @@ -184,6 +190,8 @@ elseif (${OQS_ALGS_ENABLED} STREQUAL "STD") ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_END elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_R4") filter_algs("KEM_classic_mceliece_348864;KEM_classic_mceliece_348864f;KEM_classic_mceliece_460896;KEM_classic_mceliece_460896f;KEM_classic_mceliece_6688128;KEM_classic_mceliece_6688128f;KEM_classic_mceliece_6960119;KEM_classic_mceliece_6960119f;KEM_classic_mceliece_8192128;KEM_classic_mceliece_8192128f;KEM_hqc_128;KEM_hqc_192;KEM_hqc_256;KEM_bike_l1;KEM_bike_l3;KEM_bike_l5") +elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_SIG_ONRAMP") + filter_algs("SIG_mayo_1;SIG_mayo_2;SIG_mayo_3;SIG_mayo_5") else() message(STATUS "Alg enablement unchanged") endif() @@ -495,6 +503,31 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) endif() endif() + +if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + 
cmake_dependent_option(OQS_ENABLE_SIG_mayo_1_avx2 "" ON "OQS_ENABLE_SIG_mayo_1" OFF) +endif() +endif() + +if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + cmake_dependent_option(OQS_ENABLE_SIG_mayo_2_avx2 "" ON "OQS_ENABLE_SIG_mayo_2" OFF) +endif() +endif() + +if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + cmake_dependent_option(OQS_ENABLE_SIG_mayo_3_avx2 "" ON "OQS_ENABLE_SIG_mayo_3" OFF) +endif() +endif() + +if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux") +if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS)) + cmake_dependent_option(OQS_ENABLE_SIG_mayo_5_avx2 "" ON "OQS_ENABLE_SIG_mayo_5" OFF) +endif() +endif() + ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_CONDITIONAL_END option(OQS_ENABLE_SIG_STFL_XMSS "Enable XMSS algorithm family" OFF) diff --git a/.github/workflows/release-test.yml b/.github/workflows/release-test.yml index 47957f4d20..2a4addd541 100644 --- a/.github/workflows/release-test.yml +++ b/.github/workflows/release-test.yml @@ -17,7 +17,7 @@ on: jobs: oqs-provider-release-test: - if: github.event_name == 'release' || endsWith( github.event.head_commit.message, '[trigger downstream]' ) + if: github.event_name == 'release' || contains( github.event.head_commit.message, '[trigger downstream]' ) runs-on: ubuntu-latest steps: - name: Checkout release tests script diff --git a/.github/workflows/unix.yml b/.github/workflows/unix.yml index 5882d9bc8f..49d520eaee 100644 --- a/.github/workflows/unix.yml +++ b/.github/workflows/unix.yml @@ -112,6 +112,11 @@ jobs: container: openquantumsafe/ci-ubuntu-focal-x86_64:latest CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON -DOQS_ALGS_ENABLED=NIST_R4 PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py + - name: focal-nistonramp-openssl + runner: ubuntu-latest + container: openquantumsafe/ci-ubuntu-focal-x86_64:latest + CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON 
-DOQS_ALGS_ENABLED=NIST_SIG_ONRAMP + PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py - name: jammy-std-openssl3 runner: ubuntu-latest container: openquantumsafe/ci-ubuntu-jammy:latest @@ -256,7 +261,7 @@ jobs: - name: Install dependencies run: env HOMEBREW_NO_AUTO_UPDATE=1 brew install ninja && pip3 install --require-hashes --break-system-packages -r .github/workflows/requirements.txt - name: Patch GCC - run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies gcc@13.rb + run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies --formula gcc@13.rb - name: Get system information run: sysctl -a | grep machdep.cpu - name: Configure diff --git a/CMakeLists.txt b/CMakeLists.txt index 0524a07c5b..ebbd58962f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -200,6 +200,9 @@ endif() if(OQS_ENABLE_SIG_SPHINCS) set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/sphincs/sig_sphincs.h) endif() +if(OQS_ENABLE_SIG_MAYO) + set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/mayo/sig_mayo.h) +endif() ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_HEADERS_END if(OQS_ENABLE_SIG_STFL_XMSS) set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig_stfl/xmss/sig_stfl_xmss.h) diff --git a/CONFIGURE.md b/CONFIGURE.md index 9bae9f5af2..d22c9fa34f 100644 --- a/CONFIGURE.md +++ b/CONFIGURE.md @@ -58,9 +58,9 @@ For a full list of such options and their default values, consult [.CMake/alg_su ## OQS_ALGS_ENABLED -A selected algorithm set is enabled. 
Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl). +A selected algorithm set is enabled. Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "NIST_SIG_ONRAMP" selecting algorithms evaluated in the NIST PQC "onramp" standardization for additional signature schemes; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl). -**Attention**: If you use any predefined value (`STD` or `NIST_R4` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored. +**Attention**: If you use any predefined value (`STD`, `NIST_R4`, or `NIST_SIG_ONRAMP` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored. **Default**: `All`. 
diff --git a/PLATFORMS.md b/PLATFORMS.md index 60f695d886..e2220229ae 100644 --- a/PLATFORMS.md +++ b/PLATFORMS.md @@ -62,4 +62,3 @@ In this policy, the words "must" and "must not" specify absolute requirements th - x86 for Windows (Visual Studio Toolchain) - ppc64le for Ubuntu (Focal) - s390x for Ubuntu (Focal) - diff --git a/README.md b/README.md index 2b8122b4d7..b21281e2cf 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. `liboqs` makes - **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5 - **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024 +- **MAYO**: MAYO-1, MAYO-2, MAYO-3, MAYO-5† - **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87) - **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple - **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple @@ -197,6 +198,7 @@ liboqs includes some third party libraries or modules that are licensed differen - `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0 - `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT - src/sig/falcon/pqclean_\*\_aarch64 : Apache License v2.0 +- `src/sig/mayo/*`: Apache License v2.0 - `src/sig/ml_dsa/pqcrystals-*`: public domain (CC0) or Apache License v2.0 - `src/sig/sphincs/pqclean_*`: CC0 (public domain) diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md new file mode 100644 index 0000000000..3174058f13 --- /dev/null +++ b/docs/algorithms/sig/mayo.md @@ -0,0 +1,62 @@ +# MAYO + +- **Algorithm type**: Digital signature scheme. 
+- **Main cryptographic assumption**: multivariable quadratic equations, oil and vinegar. +- **Principal submitters**: Ward Beullens, Fabio Campos, Sofía Celi, Basil Hess, Matthias J. Kannwischer. +- **Authors' website**: https://pqmayo.org +- **Specification version**: https://doi.org/10.46586/tches.v2024.i2.252-275. +- **Primary Source**: + - **Source**: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e with copy_from_upstream patches + - **Implementation license (SPDX-Identifier)**: Apache-2.0 + + +## Parameter set summary + +| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) | +|:---------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:| +| MAYO-1 | NA | EUF-CMA | 1 | 1168 | 24 | 321 | +| MAYO-2 | NA | EUF-CMA | 1 | 5488 | 24 | 180 | +| MAYO-3 | NA | EUF-CMA | 3 | 2656 | 32 | 577 | +| MAYO-5 | NA | EUF-CMA | 5 | 5008 | 40 | 838 | + +## MAYO-1 implementation characteristics + +| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ | +|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------| +| [Primary Source](#primary-source) | opt | All | All | None | True | True | False | +| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False | + +Are implementations chosen based on runtime CPU feature detection? **Yes**. 
+ + ‡For an explanation of what this denotes, consult the [Explanation of Terms](#explanation-of-terms) section at the end of this file. + +## MAYO-2 implementation characteristics + +| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? | +|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| +| [Primary Source](#primary-source) | opt | All | All | None | True | True | False | +| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False | + +Are implementations chosen based on runtime CPU feature detection? **Yes**. + +## MAYO-3 implementation characteristics + +| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? | +|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| +| [Primary Source](#primary-source) | opt | All | All | None | True | True | False | +| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False | + +Are implementations chosen based on runtime CPU feature detection? **Yes**. + +## MAYO-5 implementation characteristics + +| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? 
| No branching-on-secrets checked by valgrind? | Large stack usage? | +|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------| +| [Primary Source](#primary-source) | opt | All | All | None | True | True | False | +| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | True | + +Are implementations chosen based on runtime CPU feature detection? **Yes**. + +## Explanation of Terms + +- **Large Stack Usage**: Implementations identified as having such may cause failures when running in threads or in constrained environments. \ No newline at end of file diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml new file mode 100644 index 0000000000..0d84b9381d --- /dev/null +++ b/docs/algorithms/sig/mayo.yml @@ -0,0 +1,143 @@ +name: MAYO +type: signature +principal-submitters: +- Ward Beullens +- Fabio Campos +- Sofía Celi +- Basil Hess +- Matthias J. 
Kannwischer +crypto-assumption: multivariable quadratic equations, oil and vinegar +website: https://pqmayo.org +nist-round: 1 +spec-version: https://doi.org/10.46586/tches.v2024.i2.252-275 +primary-upstream: + source: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e + with copy_from_upstream patches + spdx-license-identifier: Apache-2.0 +parameter-sets: +- name: MAYO-1 + claimed-nist-level: 1 + claimed-security: EUF-CMA + length-public-key: 1168 + length-secret-key: 24 + length-signature: 321 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream: primary-upstream + upstream-id: opt + supported-platforms: all + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Darwin + - Linux + required_flags: + - avx2 + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false +- name: MAYO-2 + claimed-nist-level: 1 + claimed-security: EUF-CMA + length-public-key: 5488 + length-secret-key: 24 + length-signature: 180 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream: primary-upstream + upstream-id: opt + supported-platforms: all + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Darwin + - Linux + required_flags: + - avx2 + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + 
no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false +- name: MAYO-3 + claimed-nist-level: 3 + claimed-security: EUF-CMA + length-public-key: 2656 + length-secret-key: 32 + length-signature: 577 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream: primary-upstream + upstream-id: opt + supported-platforms: all + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Darwin + - Linux + required_flags: + - avx2 + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false +- name: MAYO-5 + claimed-nist-level: 5 + claimed-security: EUF-CMA + length-public-key: 5008 + length-secret-key: 40 + length-signature: 838 + implementations-switch-on-runtime-cpu-features: true + implementations: + - upstream: primary-upstream + upstream-id: opt + supported-platforms: all + common-crypto: + - SHA3: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: false + - upstream: primary-upstream + upstream-id: avx2 + supported-platforms: + - architecture: x86_64 + operating_systems: + - Darwin + - Linux + required_flags: + - avx2 + common-crypto: + - SHA3: liboqs + - AES: liboqs + no-secret-dependent-branching-claimed: true + no-secret-dependent-branching-checked-by-valgrind: true + large-stack-usage: true diff --git a/docs/cbom.json b/docs/cbom.json index 7dd47dc218..358fc28b39 100644 --- a/docs/cbom.json +++ b/docs/cbom.json @@ -1,23 +1,23 @@ { "bomFormat": "CBOM", "specVersion": "1.4-cbom-1.0", - "serialNumber": "urn:uuid:b3ac0f3d-b320-4f0f-bbef-6c535c1e9874", + "serialNumber": 
"urn:uuid:004d7395-7601-44af-97dd-57c2214e5f60", "version": 1, "metadata": { - "timestamp": "2024-03-05T11:49:42.428605", + "timestamp": "2024-07-11T15:22:22.228289", "component": { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d", + "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65", "name": "liboqs", - "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d" + "version": "ca5d956097e10672aaa9bb7994057bcc58291b65" } }, "components": [ { "type": "library", - "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d", + "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65", "name": "liboqs", - "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d" + "version": "ca5d956097e10672aaa9bb7994057bcc58291b65" }, { "type": "crypto-asset", @@ -1539,6 +1539,166 @@ "nistQuantumSecurityLevel": 5 } }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-1:generic", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-1", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "generic" + }, + "nistQuantumSecurityLevel": 1 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-1:x86_64", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-1", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "x86_64" + }, + "nistQuantumSecurityLevel": 1 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-2:generic", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-2", + "primitive": "signature", + 
"implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "generic" + }, + "nistQuantumSecurityLevel": 1 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-2:x86_64", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-2", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "x86_64" + }, + "nistQuantumSecurityLevel": 1 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-3:generic", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-3", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "generic" + }, + "nistQuantumSecurityLevel": 3 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-3:x86_64", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-3", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "x86_64" + }, + "nistQuantumSecurityLevel": 3 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-5:generic", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-5", + "primitive": "signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "generic" + }, + "nistQuantumSecurityLevel": 5 + } + }, + { + "type": "crypto-asset", + "bom-ref": "alg:MAYO-5:x86_64", + "name": "MAYO", + "cryptoProperties": { + "assetType": "algorithm", + "algorithmProperties": { + "variant": "MAYO-5", + "primitive": 
"signature", + "implementationLevel": "softwarePlainRam", + "cryptoFunctions": [ + "keygen", + "sign", + "verify" + ], + "implementationPlatform": "x86_64" + }, + "nistQuantumSecurityLevel": 5 + } + }, { "type": "crypto-asset", "bom-ref": "alg:ML-DSA-44-ipd:generic", @@ -2168,7 +2328,7 @@ ], "dependencies": [ { - "ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d", + "ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65", "dependsOn": [ "alg:BIKE-L1:x86_64", "alg:BIKE-L3:x86_64", @@ -2246,6 +2406,14 @@ "alg:Falcon-padded-1024:generic", "alg:Falcon-padded-1024:x86_64", "alg:Falcon-padded-1024:armv8-a", + "alg:MAYO-1:generic", + "alg:MAYO-1:x86_64", + "alg:MAYO-2:generic", + "alg:MAYO-2:x86_64", + "alg:MAYO-3:generic", + "alg:MAYO-3:x86_64", + "alg:MAYO-5:generic", + "alg:MAYO-5:x86_64", "alg:ML-DSA-44-ipd:generic", "alg:ML-DSA-44-ipd:x86_64", "alg:ML-DSA-65-ipd:generic", @@ -2843,6 +3011,68 @@ ], "dependencyType": "uses" }, + { + "ref": "alg:MAYO-1:generic", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-1:x86_64", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-2:generic", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-2:x86_64", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-3:generic", + "dependsOn": [ + "alg:sha3" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-3:x86_64", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-5:generic", + "dependsOn": [ + "alg:sha3" + ], + "dependencyType": "uses" + }, + { + "ref": "alg:MAYO-5:x86_64", + "dependsOn": [ + "alg:sha3", + "alg:aes" + ], + "dependencyType": "uses" + }, { "ref": "alg:ML-DSA-44-ipd:generic", "dependsOn": [ diff --git 
a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml index 3417180c7c..f9582fa74f 100644 --- a/scripts/copy_from_upstream/copy_from_upstream.yml +++ b/scripts/copy_from_upstream/copy_from_upstream.yml @@ -53,6 +53,14 @@ upstreams: sig_meta_path: '{pretty_name_full}_META.yml' sig_scheme_path: '.' patches: [pqcrystals-ml_dsa_ipd.patch] + - + name: pqmayo + git_url: https://github.com/PQCMayo/MAYO-C.git + git_branch: nibbling-mayo + git_commit: cde2675ff404b0ae070e7dbc3d962ea0b026a81e + sig_meta_path: 'META/{pretty_name_full}_META.yml' + sig_scheme_path: '.' + patches: [pqmayo-aes.patch, pqmayo-mem.patch] kems: - name: classic_mceliece @@ -301,3 +309,28 @@ sigs: pqclean_scheme: sphincs-shake-256s-simple pretty_name_full: SPHINCS+-SHAKE-256s-simple signed_msg_order: sig_then_msg + - + name: mayo + default_implementation: opt + upstream_location: pqmayo + schemes: + - + scheme: "1" + pqclean_scheme: mayo-1 + pretty_name_full: MAYO-1 + signed_msg_order: sig_then_msg + - + scheme: "2" + pqclean_scheme: mayo-2 + pretty_name_full: MAYO-2 + signed_msg_order: sig_then_msg + - + scheme: "3" + pqclean_scheme: mayo-3 + pretty_name_full: MAYO-3 + signed_msg_order: sig_then_msg + - + scheme: "5" + pqclean_scheme: mayo-5 + pretty_name_full: MAYO-5 + signed_msg_order: sig_then_msg diff --git a/scripts/copy_from_upstream/patches/pqmayo-aes.patch b/scripts/copy_from_upstream/patches/pqmayo-aes.patch new file mode 100644 index 0000000000..2dd469eed3 --- /dev/null +++ b/scripts/copy_from_upstream/patches/pqmayo-aes.patch @@ -0,0 +1,22 @@ +diff --git a/src/common/aes_ctr.h b/src/common/aes_ctr.h +index c47c01e..c5fd013 100644 +--- a/src/common/aes_ctr.h ++++ b/src/common/aes_ctr.h +@@ -16,8 +16,14 @@ int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); + #define AES_128_CTR AES_128_CTR_NI + #else +-int AES_128_CTR(unsigned char *output, size_t outputByteLen, +- const 
unsigned char *input, size_t inputByteLen); ++#include ++static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, ++ const unsigned char *input, size_t inputByteLen) { ++ (void) inputByteLen; ++ uint8_t iv[12] = { 0 }; ++ aes128ctr_prf(output, outputByteLen, input, iv); ++ return (int) outputByteLen; ++} + #endif + + #endif + \ No newline at end of file diff --git a/scripts/copy_from_upstream/patches/pqmayo-mem.patch b/scripts/copy_from_upstream/patches/pqmayo-mem.patch new file mode 100644 index 0000000000..ab47b79a06 --- /dev/null +++ b/scripts/copy_from_upstream/patches/pqmayo-mem.patch @@ -0,0 +1,33 @@ +diff --git a/include/mem.h b/include/mem.h +index 4695847..dc5172c 100644 +--- a/include/mem.h ++++ b/include/mem.h +@@ -40,13 +40,16 @@ static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); + } + ++#include + /** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +-void mayo_secure_free(void *mem, size_t size); ++static inline void mayo_secure_free(void *mem, size_t size) { ++ OQS_MEM_secure_free(mem, size); ++} + + /** + * Clears memory. +@@ -54,6 +57,8 @@ void mayo_secure_free(void *mem, size_t size); + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. 
+ */ +-void mayo_secure_clear(void *mem, size_t size); ++static inline void mayo_secure_clear(void *mem, size_t size) { ++ OQS_MEM_cleanse(mem, size); ++} + + #endif +\ No newline at end of file diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index a5b64fd294..25a9b74086 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -55,6 +55,10 @@ if(OQS_ENABLE_SIG_SPHINCS) add_subdirectory(sig/sphincs) set(SIG_OBJS ${SIG_OBJS} ${SPHINCS_OBJS}) endif() +if(OQS_ENABLE_SIG_MAYO) + add_subdirectory(sig/mayo) + set(SIG_OBJS ${SIG_OBJS} ${MAYO_OBJS}) +endif() ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_OBJECTS_END if(OQS_ENABLE_SIG_STFL_XMSS) diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt index 73b917e07c..d82b4ea268 100644 --- a/src/common/CMakeLists.txt +++ b/src/common/CMakeLists.txt @@ -26,7 +26,7 @@ else() if (OQS_DIST_X86_64_BUILD OR OQS_USE_AES_INSTRUCTIONS) set(AES_IMPL ${AES_IMPL} aes/aes128_ni.c) set(AES_IMPL ${AES_IMPL} aes/aes256_ni.c) - set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS -maes) + set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3") set_source_files_properties(aes/aes256_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3") elseif (OQS_DIST_ARM64_V8_BUILD) set(AES_IMPL ${AES_IMPL} aes/aes128_armv8.c) diff --git a/src/common/aes/aes.c b/src/common/aes/aes.c index 3ac8794991..01e473b819 100644 --- a/src/common/aes/aes.c +++ b/src/common/aes/aes.c @@ -19,6 +19,18 @@ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **schedule) { callbacks->AES128_ECB_load_schedule(key, schedule); } +void OQS_AES128_CTR_inc_init(const uint8_t *key, void **_schedule) { + callbacks->AES128_CTR_inc_init(key, _schedule); +} + +void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) { + callbacks->AES128_CTR_inc_iv(iv, iv_len, _schedule); +} + +void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) { + callbacks->AES128_CTR_inc_ivu64(iv, _schedule); +} + void 
OQS_AES128_free_schedule(void *schedule) { callbacks->AES128_free_schedule(schedule); } @@ -51,6 +63,10 @@ void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len callbacks->AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext); } +void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { + callbacks->AES128_CTR_inc_stream_iv(iv, iv_len, schedule, out, out_len); +} + void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) { callbacks->AES256_ECB_enc(plaintext, plaintext_len, key, ciphertext); } diff --git a/src/common/aes/aes.h b/src/common/aes/aes.h index 011686b3e9..d0d6d634bc 100644 --- a/src/common/aes/aes.h +++ b/src/common/aes/aes.h @@ -28,6 +28,37 @@ extern "C" { */ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **ctx); +/** + * Function to initialize a context and fill a key schedule given an initial key for + * use in CTR mode. + * + * @param key Initial Key. + * @param ctx Abstract data structure for a key schedule. + */ +void OQS_AES128_CTR_inc_init(const uint8_t *key, void **ctx); + +/** + * Function to fill a context given an IV for use in CTR mode. + * + * Handles a 12- or 16-byte IV. If a 12-byte IV is given, then 4 counter + * bytes are initialized to all zeros. + * + * @param iv Initialization Vector. + * @param iv_len Length of the initialization vector. + * @param ctx Abstract data structure for IV. + */ +void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *ctx); + +/** + * Function to fill a context given an IV for use in CTR mode. + * Handles an 8-byte IV passed as a 64-bit unsigned integer, + * counter bytes are initialized to zero. + * + * @param iv Initialization Vector as 64-bit integer. + * @param ctx Abstract data structure for IV. + */ +void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *ctx); + /** * Function to free a key schedule. 
* @@ -55,6 +86,21 @@ void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, co */ void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext); +/** + * AES counter mode keystream generator. A context generated by + * OQS_AES128_CTR_inc_init() is passed rather than a key. + * + * Handles a 12- or 16-byte IV. If a 12-byte IV is given, then 4 counter + * bytes are initialized to all zeros. + * + * @param iv 12- or 16-byte initialization vector. + * @param iv_len Length of IV in bytes. + * @param ctx Abstract data structure for a key schedule. + * @param out Pointer to a block of memory which is big enough to contain out_len bytes; the result will be written here. + * @param out_len Length of output bytes to generate. + */ +void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len); + /** * Function to fill a key schedule given an initial key for use in ECB mode encryption. 
* diff --git a/src/common/aes/aes128_armv8.c b/src/common/aes/aes128_armv8.c index b5003ad018..292539fefa 100644 --- a/src/common/aes/aes128_armv8.c +++ b/src/common/aes/aes128_armv8.c @@ -3,15 +3,28 @@ #include #include #include +#include #include #include -#define PQC_AES128_STATESIZE 88 typedef struct { - uint64_t sk_exp[PQC_AES128_STATESIZE]; + uint64_t sk_exp[22]; + uint8_t iv[16]; } aes128ctx; +void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule) { + aes128ctx *ctx = _schedule; + if (iv_len == 12) { + memcpy(ctx->iv, iv, 12); + memset(&ctx->iv[12], 0, 4); + } else if (iv_len == 16) { + memcpy(ctx->iv, iv, 16); + } else { + exit(EXIT_FAILURE); + } +} + // From crypto_core/aes128encrypt/dolbeau/armv8crypto static inline void aes128_armv8_encrypt(const unsigned char *rkeys, const unsigned char *n, unsigned char *out) { uint8x16_t temp = vld1q_u8(n); @@ -62,3 +75,64 @@ void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plainte oqs_aes128_enc_sch_block_armv8(plaintext + (16 * block), (const void *) ctx->sk_exp, ciphertext + (16 * block)); } } + +static uint32_t UINT32_TO_BE(const uint32_t x) { + union { + uint32_t val; + uint8_t bytes[4]; + } y; + y.bytes[0] = (x >> 24) & 0xFF; + y.bytes[1] = (x >> 16) & 0xFF; + y.bytes[2] = (x >> 8) & 0xFF; + y.bytes[3] = x & 0xFF; + return y.val; +} +#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0)) + + +void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks) { + aes128ctx *ctx = (aes128ctx *) schedule; + uint8_t *block = ctx->iv; + uint32_t ctr; + uint32_t ctr_be; + memcpy(&ctr_be, &block[12], 4); + ctr = BE_TO_UINT32(ctr_be); + while (out_blks >= 1) { + oqs_aes128_enc_sch_block_armv8(block, schedule, out); + out += 16; + out_blks--; + ctr++; + ctr_be = UINT32_TO_BE(ctr); + memcpy(&block[12], (uint8_t *) &ctr_be, 4); + } +} + +void 
oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { + uint8_t block[16]; + uint32_t ctr; + uint32_t ctr_be; + memcpy(block, iv, 12); + if (iv_len == 12) { + ctr = 0; + } else if (iv_len == 16) { + memcpy(&ctr_be, &iv[12], 4); + ctr = BE_TO_UINT32(ctr_be); + } else { + exit(EXIT_FAILURE); + } + while (out_len >= 16) { + ctr_be = UINT32_TO_BE(ctr); + memcpy(&block[12], (uint8_t *) &ctr_be, 4); + oqs_aes128_enc_sch_block_armv8(block, schedule, out); + out += 16; + out_len -= 16; + ctr++; + } + if (out_len > 0) { + uint8_t tmp[16]; + ctr_be = UINT32_TO_BE(ctr); + memcpy(&block[12], (uint8_t *) &ctr_be, 4); + oqs_aes128_enc_sch_block_armv8(block, schedule, tmp); + memcpy(out, tmp, out_len); + } +} diff --git a/src/common/aes/aes128_ni.c b/src/common/aes/aes128_ni.c index 0593614503..b08a3041a4 100644 --- a/src/common/aes/aes128_ni.c +++ b/src/common/aes/aes128_ni.c @@ -5,9 +5,16 @@ #include #include #include +#include #include #include +#include + +typedef struct { + __m128i sk_exp[11]; + __m128i iv; +} aes128ctx; // From crypto_core/aes128ncrypt/dolbeau/aesenc-int static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[11]) { @@ -42,21 +49,39 @@ static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rke } void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) { - *_schedule = malloc(11 * sizeof(__m128i)); + *_schedule = malloc(sizeof(aes128ctx)); + OQS_EXIT_IF_NULLPTR(*_schedule, "AES"); assert(*_schedule != NULL); - __m128i *schedule = (__m128i *) *_schedule; + __m128i *schedule = ((aes128ctx *) *_schedule)->sk_exp; aes128ni_setkey_encrypt(key, schedule); } +void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) { + aes128ctx *ctx = _schedule; + __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + if (iv_len == 12) { + const int32_t *ivi = (const int32_t *) iv; + ctx->iv = 
_mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx); + } else if (iv_len == 16) { + ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx); + } else { + exit(EXIT_FAILURE); + } +} + +void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule) { + aes128ctx *ctx = _schedule; + ctx->iv = _mm_loadl_epi64((__m128i *)&iv); +} + void oqs_aes128_free_schedule_ni(void *schedule) { if (schedule != NULL) { - OQS_MEM_secure_free(schedule, 11 * sizeof(__m128i)); + OQS_MEM_secure_free(schedule, sizeof(aes128ctx)); } } // From crypto_core/aes128encrypt/dolbeau/aesenc-int -static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) { - __m128i nv = _mm_loadu_si128((const __m128i *)n); +static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, unsigned char *out) { __m128i temp = _mm_xor_si128(nv, rkeys[0]); temp = _mm_aesenc_si128(temp, rkeys[1]); temp = _mm_aesenc_si128(temp, rkeys[2]); @@ -71,9 +96,45 @@ static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char _mm_storeu_si128((__m128i *)(out), temp); } +// 4x interleaved encryption +static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0, + __m128i n1, __m128i n2, __m128i n3, + unsigned char *out) { + __m128i temp0 = _mm_xor_si128(n0, rkeys[0]); + __m128i temp1 = _mm_xor_si128(n1, rkeys[0]); + __m128i temp2 = _mm_xor_si128(n2, rkeys[0]); + __m128i temp3 = _mm_xor_si128(n3, rkeys[0]); + +#define AESNENCX4(IDX) \ + temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \ + temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \ + temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \ + temp3 = _mm_aesenc_si128(temp3, rkeys[IDX]) + + AESNENCX4(1); + AESNENCX4(2); + AESNENCX4(3); + AESNENCX4(4); + AESNENCX4(5); + AESNENCX4(6); + AESNENCX4(7); + AESNENCX4(8); + AESNENCX4(9); + + temp0 = _mm_aesenclast_si128(temp0, rkeys[10]); + temp1 = _mm_aesenclast_si128(temp1, rkeys[10]); + temp2 = _mm_aesenclast_si128(temp2, rkeys[10]); + 
temp3 = _mm_aesenclast_si128(temp3, rkeys[10]); + + _mm_storeu_si128((__m128i *)(out + 0), temp0); + _mm_storeu_si128((__m128i *)(out + 16), temp1); + _mm_storeu_si128((__m128i *)(out + 32), temp2); + _mm_storeu_si128((__m128i *)(out + 48), temp3); +} + void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) { - const __m128i *schedule = (const __m128i *) _schedule; - aes128ni_encrypt(schedule, plaintext, ciphertext); + const __m128i *schedule = ((const aes128ctx *) _schedule)->sk_exp; + aes128ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext); } void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) { @@ -82,3 +143,61 @@ void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_ oqs_aes128_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block)); } } + +void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) { + aes128ctx *ctx = (aes128ctx *) schedule; + const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + + while (out_blks >= 4) { + __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask); + __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask); + __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask); + __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask); + aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); + ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0)); + out += 64; + out_blks -= 4; + } + while (out_blks >= 1) { + __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask); + aes128ni_encrypt(schedule, nv0, out); + ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)); + out += 16; + out_blks--; + } +} + +void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { 
+ __m128i block; + __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0); + if (iv_len == 12) { + const int32_t *ivi = (const int32_t *) iv; + block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]); + } else if (iv_len == 16) { + block = _mm_loadu_si128((const __m128i *)iv); + } else { + exit(EXIT_FAILURE); + } + + while (out_len >= 64) { + __m128i nv0 = block; + __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask); + __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask); + __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask); + aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out); + block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask); + out += 64; + out_len -= 64; + } + while (out_len >= 16) { + aes128ni_encrypt(schedule, block, out); + out += 16; + out_len -= 16; + block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask); + } + if (out_len > 0) { + uint8_t tmp[16]; + aes128ni_encrypt(schedule, block, tmp); + memcpy(out, tmp, out_len); + } +} diff --git a/src/common/aes/aes_c.c b/src/common/aes/aes_c.c index 6ee93bc76a..f2ec57a500 100644 --- a/src/common/aes/aes_c.c +++ b/src/common/aes/aes_c.c @@ -574,6 +574,39 @@ static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks, } } +static inline void aes128_ctr_upd_blks(unsigned char *out, size_t outblks, aes128ctx *ctx) { + uint32_t ivw[16]; + size_t i; + uint32_t cc; + uint8_t *iv = ctx->iv; + uint32_t blocks = (uint32_t) outblks; + unsigned int nrounds = 10; + + br_range_dec32le(ivw, 4, iv); + + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + cc = br_swap32(ivw[3]); + ivw[ 7] = br_swap32(cc + 1); + ivw[11] = br_swap32(cc + 2); + ivw[15] = 
br_swap32(cc + 3); + + while (outblks >= 4) { + aes_ctr4x(out, ivw, ctx->sk_exp, nrounds); + out += 64; + outblks -= 4; + } + if (outblks > 0) { + unsigned char tmp[64]; + aes_ctr4x(tmp, ivw, ctx->sk_exp, nrounds); + for (i = 0; i < outblks * 16; i++) { + out[i] = tmp[i]; + } + } + br_enc32be(&ctx->iv[12], cc + blocks); +} + static inline void aes256_ctr_upd_blks(unsigned char *out, size_t outblks, aes256ctx *ctx) { uint32_t ivw[16]; size_t i; @@ -725,12 +758,48 @@ void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule) aes_keysched_no_bitslice(schedule, (const unsigned char *) key, 16); } +void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule) { + aes128ctx *ctx = _schedule; + if (iv_len == 12) { + memcpy(ctx->iv, iv, 12); + memset(&ctx->iv[12], 0, 4); + } else if (iv_len == 16) { + memcpy(ctx->iv, iv, 16); + } else { + exit(EXIT_FAILURE); + } +} + +void oqs_aes128_load_iv_u64_c(uint64_t iv, void *schedule) { + OQS_EXIT_IF_NULLPTR(schedule, "AES"); + aes128ctx *ctx = (aes128ctx *) schedule; + ctx->iv[7] = (unsigned char)(iv >> 56); + ctx->iv[6] = (unsigned char)(iv >> 48); + ctx->iv[5] = (unsigned char)(iv >> 40); + ctx->iv[4] = (unsigned char)(iv >> 32); + ctx->iv[3] = (unsigned char)(iv >> 24); + ctx->iv[2] = (unsigned char)(iv >> 16); + ctx->iv[1] = (unsigned char)(iv >> 8); + ctx->iv[0] = (unsigned char)iv; + memset(&ctx->iv[8], 0, 8); +} + void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) { assert(plaintext_len % 16 == 0); const aes128ctx *ctx = (const aes128ctx *) schedule; aes_ecb(ciphertext, plaintext, plaintext_len / 16, ctx->sk_exp, 10); } +void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { + const aes128ctx *ctx = (const aes128ctx *) schedule; + aes_ctr(out, out_len, iv, iv_len, ctx->sk_exp, 10); +} + +void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, 
uint8_t *out, size_t out_blks) { + aes128ctx *ctx = (aes128ctx *) schedule; + aes128_ctr_upd_blks(out, out_blks, ctx); +} + void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) { assert(plaintext_len % 16 == 0); const aes256ctx *ctx = (const aes256ctx *) schedule; diff --git a/src/common/aes/aes_impl.c b/src/common/aes/aes_impl.c index ae9be662cf..706a5f186f 100644 --- a/src/common/aes/aes_impl.c +++ b/src/common/aes/aes_impl.c @@ -46,6 +46,26 @@ static void AES128_ECB_load_schedule(const uint8_t *key, void **_schedule) { ); } +static void AES128_CTR_inc_init(const uint8_t *key, void **_schedule) { + AES128_ECB_load_schedule(key, _schedule); +} + +static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) { + C_OR_NI_OR_ARM( + oqs_aes128_load_iv_c(iv, iv_len, _schedule), + oqs_aes128_load_iv_ni(iv, iv_len, _schedule), + oqs_aes128_load_iv_armv8(iv, iv_len, _schedule) + ); +} + +static void AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) { + C_OR_NI_OR_ARM( + oqs_aes128_load_iv_u64_c(iv, _schedule), + oqs_aes128_load_iv_u64_ni(iv, _schedule), + (void) iv; (void) _schedule + ); +} + static void AES128_free_schedule(void *schedule) { C_OR_NI_OR_ARM( oqs_aes128_free_schedule_c(schedule), @@ -107,6 +127,14 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_ ); } +static void AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { + C_OR_NI_OR_ARM( + oqs_aes128_ctr_enc_sch_c(iv, iv_len, schedule, out, out_len), + oqs_aes128_ctr_enc_sch_ni(iv, iv_len, schedule, out, out_len), + oqs_aes128_ctr_enc_sch_armv8(iv, iv_len, schedule, out, out_len) + ); +} + static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext); static void AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, 
uint8_t *ciphertext) { @@ -141,19 +169,23 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_ } struct OQS_AES_callbacks aes_default_callbacks = { - AES128_ECB_load_schedule, - AES128_free_schedule, - AES128_ECB_enc, - AES128_ECB_enc_sch, - AES256_ECB_load_schedule, - AES256_CTR_inc_init, - AES256_CTR_inc_iv, - AES256_CTR_inc_ivu64, - AES256_free_schedule, - AES256_ECB_enc, - AES256_ECB_enc_sch, - AES256_CTR_inc_stream_iv, - AES256_CTR_inc_stream_blks, + .AES128_ECB_load_schedule = AES128_ECB_load_schedule, + .AES128_CTR_inc_init = AES128_CTR_inc_init, + .AES128_CTR_inc_iv = AES128_CTR_inc_iv, + .AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64, + .AES128_free_schedule = AES128_free_schedule, + .AES128_ECB_enc = AES128_ECB_enc, + .AES128_ECB_enc_sch = AES128_ECB_enc_sch, + .AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv, + .AES256_ECB_load_schedule = AES256_ECB_load_schedule, + .AES256_CTR_inc_init = AES256_CTR_inc_init, + .AES256_CTR_inc_iv = AES256_CTR_inc_iv, + .AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64, + .AES256_free_schedule = AES256_free_schedule, + .AES256_ECB_enc = AES256_ECB_enc, + .AES256_ECB_enc_sch = AES256_ECB_enc_sch, + .AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv, + .AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks, }; void OQS_AES_init(void) { diff --git a/src/common/aes/aes_local.h b/src/common/aes/aes_local.h index 4c9942a085..a9001a2e31 100644 --- a/src/common/aes/aes_local.h +++ b/src/common/aes/aes_local.h @@ -3,18 +3,29 @@ #include void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule); +void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule); +void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule); void oqs_aes128_free_schedule_ni(void *schedule); void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext); void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void 
*schedule, uint8_t *ciphertext); +void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len); +void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_len); void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule); +void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule); +void oqs_aes128_load_iv_u64_c(uint64_t iv, void *_schedule); void oqs_aes128_free_schedule_c(void *schedule); void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext); +void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len); +void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, uint8_t *out, size_t out_len); void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule); +void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule); void oqs_aes128_free_schedule_no_bitslice(void *schedule); void oqs_aes128_enc_sch_block_armv8(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext); void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext); +void oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len); +void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks); void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule); void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule); diff --git a/src/common/aes/aes_ops.h b/src/common/aes/aes_ops.h index 5a26f75764..a64c47d28d 100644 --- a/src/common/aes/aes_ops.h +++ b/src/common/aes/aes_ops.h @@ -25,6 +25,21 @@ struct OQS_AES_callbacks { */ void (*AES128_ECB_load_schedule)(const uint8_t *key, void **ctx); + /** + * Implementation of function 
OQS_AES128_CTR_inc_init. + */ + void (*AES128_CTR_inc_init)(const uint8_t *key, void **ctx); + + /** + * Implementation of function OQS_AES128_CTR_inc_iv. + */ + void (*AES128_CTR_inc_iv)(const uint8_t *iv, size_t iv_len, void *ctx); + + /** + * Implementation of function OQS_AES128_CTR_inc_ivu64. + */ + void (*AES128_CTR_inc_ivu64)(uint64_t iv, void *ctx); + /** * Implementation of function OQS_AES128_free_schedule. */ @@ -40,6 +55,11 @@ struct OQS_AES_callbacks { */ void (*AES128_ECB_enc_sch)(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext); + /** + * Implementation of function OQS_AES128_CTR_inc_stream_iv. + */ + void (*AES128_CTR_inc_stream_iv)(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len); + /** * Implementation of function OQS_AES256_ECB_load_schedule. */ diff --git a/src/common/aes/aes_ossl.c b/src/common/aes/aes_ossl.c index feaff39557..c7dc5b9445 100644 --- a/src/common/aes/aes_ossl.c +++ b/src/common/aes/aes_ossl.c @@ -66,6 +66,67 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ks->ctx, ciphertext, &outlen)); } +static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { + EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new()); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); + uint8_t iv_ctr[16]; + if (iv_len == 12) { + memcpy(iv_ctr, iv, 12); + iv_ctr[12] = 0; + iv_ctr[13] = 0; + iv_ctr[14] = 0; + iv_ctr[15] = 0; + } else if (iv_len == 16) { + memcpy(iv_ctr, iv, 16); + } else { + exit(EXIT_FAILURE); + } + const struct key_schedule *ks = (const struct key_schedule *) schedule; + OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ctr_ctx, oqs_aes_128_ctr(), NULL, ks->key, iv_ctr)); + + SIZE_T_TO_INT_OR_EXIT(out_len, out_len_input_int) + memset(out, 0, (size_t)out_len_input_int); + int out_len_output; + 
OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptUpdate)(ctr_ctx, out, &out_len_output, out, out_len_input_int)); + OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ctr_ctx, out + out_len_output, &out_len_output)); + OSSL_FUNC(EVP_CIPHER_CTX_free)(ctr_ctx); +} + +static void AES128_CTR_inc_init(const uint8_t *key, void **schedule) { + *schedule = malloc(sizeof(struct key_schedule)); + OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); + + struct key_schedule *ks = (struct key_schedule *) *schedule; + EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); + + ks->for_ECB = 0; + ks->ctx = ctr_ctx; + memcpy(ks->key, key, 16); +} + +static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *schedule) { + OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL"); + struct key_schedule *ks = (struct key_schedule *) schedule; + if (iv_len == 12) { + memcpy(ks->iv, iv, 12); + memset(&ks->iv[12], 0, 4); + } else if (iv_len == 16) { + memcpy(ks->iv, iv, 16); + } else { + exit(EXIT_FAILURE); + } + OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv)); +} + +static void AES128_CTR_inc_ivu64(uint64_t iv, void *schedule) { + OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL"); + struct key_schedule *ks = (struct key_schedule *) schedule; + br_enc64be(ks->iv, iv); + memset(&ks->iv[8], 0, 8); + OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv)); +} + static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) { *schedule = malloc(sizeof(struct key_schedule)); OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); @@ -79,11 +140,12 @@ static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) { static void AES256_CTR_inc_init(const uint8_t *key, void **schedule) { *schedule = malloc(sizeof(struct key_schedule)); + OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); + struct key_schedule *ks = (struct key_schedule *) *schedule; EVP_CIPHER_CTX *ctr_ctx = 
OSSL_FUNC(EVP_CIPHER_CTX_new)(); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); - OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL"); ks->for_ECB = 0; ks->ctx = ctr_ctx; memcpy(ks->key, key, 32); @@ -130,7 +192,7 @@ static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_ static void AES256_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) { EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); - assert(ctr_ctx != NULL); + OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL"); uint8_t iv_ctr[16]; if (iv_len == 12) { memcpy(iv_ctr, iv, 12); @@ -164,17 +226,21 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_ } struct OQS_AES_callbacks aes_default_callbacks = { - AES128_ECB_load_schedule, - AES128_free_schedule, - AES128_ECB_enc, - AES128_ECB_enc_sch, - AES256_ECB_load_schedule, - AES256_CTR_inc_init, - AES256_CTR_inc_iv, - AES256_CTR_inc_ivu64, - AES256_free_schedule, - AES256_ECB_enc, - AES256_ECB_enc_sch, - AES256_CTR_inc_stream_iv, - AES256_CTR_inc_stream_blks, + .AES128_ECB_load_schedule = AES128_ECB_load_schedule, + .AES128_CTR_inc_init = AES128_CTR_inc_init, + .AES128_CTR_inc_iv = AES128_CTR_inc_iv, + .AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64, + .AES128_free_schedule = AES128_free_schedule, + .AES128_ECB_enc = AES128_ECB_enc, + .AES128_ECB_enc_sch = AES128_ECB_enc_sch, + .AES256_ECB_load_schedule = AES256_ECB_load_schedule, + .AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv, + .AES256_CTR_inc_init = AES256_CTR_inc_init, + .AES256_CTR_inc_iv = AES256_CTR_inc_iv, + .AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64, + .AES256_free_schedule = AES256_free_schedule, + .AES256_ECB_enc = AES256_ECB_enc, + .AES256_ECB_enc_sch = AES256_ECB_enc_sch, + .AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv, + .AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks, }; diff --git a/src/common/ossl_functions.h b/src/common/ossl_functions.h index 
aa0ceb127c..438ec1fafa 100644 --- a/src/common/ossl_functions.h +++ b/src/common/ossl_functions.h @@ -25,6 +25,7 @@ VOID_FUNC(void, EVP_MD_CTX_free, (EVP_MD_CTX *ctx), (ctx)) FUNC(EVP_MD_CTX *, EVP_MD_CTX_new, (void), ()) FUNC(int, EVP_MD_CTX_reset, (EVP_MD_CTX *ctx), (ctx)) FUNC(const EVP_CIPHER *, EVP_aes_128_ecb, (void), ()) +FUNC(const EVP_CIPHER *, EVP_aes_128_ctr, (void), ()) FUNC(const EVP_CIPHER *, EVP_aes_256_ecb, (void), ()) FUNC(const EVP_CIPHER *, EVP_aes_256_ctr, (void), ()) #if OPENSSL_VERSION_NUMBER >= 0x30000000L diff --git a/src/common/ossl_helpers.c b/src/common/ossl_helpers.c index 1c73d8b901..76dccb0ef4 100644 --- a/src/common/ossl_helpers.c +++ b/src/common/ossl_helpers.c @@ -18,7 +18,7 @@ static EVP_MD *sha256_ptr, *sha384_ptr, *sha512_ptr, *sha3_256_ptr, *sha3_384_ptr, *sha3_512_ptr, *shake128_ptr, *shake256_ptr; -static EVP_CIPHER *aes128_ecb_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr; +static EVP_CIPHER *aes128_ecb_ptr, *aes128_ctr_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr; static void fetch_ossl_objects(void) { sha256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHA256", NULL); @@ -32,12 +32,13 @@ static void fetch_ossl_objects(void) { shake256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHAKE256", NULL); aes128_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-ECB", NULL); + aes128_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-CTR", NULL); aes256_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-ECB", NULL); aes256_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-CTR", NULL); if (!sha256_ptr || !sha384_ptr || !sha512_ptr || !sha3_256_ptr || !sha3_384_ptr || !sha3_512_ptr || !shake128_ptr || !shake256_ptr || - !aes128_ecb_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) { + !aes128_ecb_ptr || !aes128_ctr_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) { fprintf(stderr, "liboqs warning: OpenSSL initialization failure. 
Is provider for SHA, SHAKE, AES enabled?\n"); } } @@ -61,6 +62,8 @@ static void free_ossl_objects(void) { shake256_ptr = NULL; OSSL_FUNC(EVP_CIPHER_free)(aes128_ecb_ptr); aes128_ecb_ptr = NULL; + OSSL_FUNC(EVP_CIPHER_free)(aes128_ctr_ptr); + aes128_ctr_ptr = NULL; OSSL_FUNC(EVP_CIPHER_free)(aes256_ecb_ptr); aes256_ecb_ptr = NULL; OSSL_FUNC(EVP_CIPHER_free)(aes256_ctr_ptr); @@ -75,7 +78,7 @@ void oqs_ossl_destroy(void) { #else if (sha256_ptr || sha384_ptr || sha512_ptr || sha3_256_ptr || sha3_384_ptr || sha3_512_ptr || shake128_ptr || shake256_ptr || - aes128_ecb_ptr || aes256_ecb_ptr || aes256_ctr_ptr) { + aes128_ecb_ptr || aes128_ctr_ptr || aes256_ecb_ptr || aes256_ctr_ptr) { free_ossl_objects(); } #endif @@ -235,6 +238,23 @@ const EVP_CIPHER *oqs_aes_128_ecb(void) { #endif } +const EVP_CIPHER *oqs_aes_128_ctr(void) { +#if OPENSSL_VERSION_NUMBER >= 0x30000000L +#if defined(OQS_USE_PTHREADS) + if (pthread_once(&init_once_control, fetch_ossl_objects)) { + return NULL; + } +#else + if (!aes128_ctr_ptr) { + fetch_ossl_objects(); + } +#endif + return aes128_ctr_ptr; +#else + return OSSL_FUNC(EVP_aes_128_ctr)(); +#endif +} + const EVP_CIPHER *oqs_aes_256_ecb(void) { #if OPENSSL_VERSION_NUMBER >= 0x30000000L #if defined(OQS_USE_PTHREADS) diff --git a/src/common/ossl_helpers.h b/src/common/ossl_helpers.h index fe6d34687a..3e1bc9ff25 100644 --- a/src/common/ossl_helpers.h +++ b/src/common/ossl_helpers.h @@ -31,6 +31,8 @@ const EVP_MD *oqs_sha3_512(void); const EVP_CIPHER *oqs_aes_128_ecb(void); +const EVP_CIPHER *oqs_aes_128_ctr(void); + const EVP_CIPHER *oqs_aes_256_ecb(void); const EVP_CIPHER *oqs_aes_256_ctr(void); diff --git a/src/common/pqclean_shims/aes.h b/src/common/pqclean_shims/aes.h index 58ae1e67c9..dc72a9e157 100644 --- a/src/common/pqclean_shims/aes.h +++ b/src/common/pqclean_shims/aes.h @@ -12,6 +12,7 @@ #define AESCTR_NONCEBYTES 12 #define AES_BLOCKBYTES 16 +typedef void *aes128ctx; typedef void *aes256ctx; #define aes256_ecb_keyexp(r, key) 
OQS_AES256_ECB_load_schedule((key), (r)) @@ -43,4 +44,12 @@ static inline void aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[ OQS_AES256_free_schedule(state); } +static inline void aes128ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[16], uint8_t nonce[12]) { + aes128ctx state; + OQS_AES128_CTR_inc_init(key, &state); + OQS_AES128_CTR_inc_stream_iv(nonce, 12, state, out, outlen); + OQS_AES128_free_schedule(state); +} + + #endif diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake index d2d01c4771..f16421fa43 100644 --- a/src/oqsconfig.h.cmake +++ b/src/oqsconfig.h.cmake @@ -189,6 +189,16 @@ #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256f_simple_avx2 1 #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple 1 #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple_avx2 1 + +#cmakedefine OQS_ENABLE_SIG_MAYO 1 +#cmakedefine OQS_ENABLE_SIG_mayo_1 1 +#cmakedefine OQS_ENABLE_SIG_mayo_1_avx2 1 +#cmakedefine OQS_ENABLE_SIG_mayo_2 1 +#cmakedefine OQS_ENABLE_SIG_mayo_2_avx2 1 +#cmakedefine OQS_ENABLE_SIG_mayo_3 1 +#cmakedefine OQS_ENABLE_SIG_mayo_3_avx2 1 +#cmakedefine OQS_ENABLE_SIG_mayo_5 1 +#cmakedefine OQS_ENABLE_SIG_mayo_5_avx2 1 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_ENABLE_DEFINES_END #cmakedefine OQS_ENABLE_SIG_STFL_XMSS 1 diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt new file mode 100644 index 0000000000..e049f71344 --- /dev/null +++ b/src/sig/mayo/CMakeLists.txt @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: MIT + +# This file was generated by +# scripts/copy_from_upstream/copy_from_upstream.py + +set(_MAYO_OBJS "") + +if(OQS_ENABLE_SIG_mayo_1) + add_library(mayo_1_opt OBJECT sig_mayo_1.c pqmayo_mayo-1_opt/api.c pqmayo_mayo-1_opt/arithmetic.c pqmayo_mayo-1_opt/mayo.c pqmayo_mayo-1_opt/params.c) + target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) + target_include_directories(mayo_1_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_opt) + 
target_include_directories(mayo_1_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_1_avx2) + add_library(mayo_1_avx2 OBJECT pqmayo_mayo-1_avx2/api.c pqmayo_mayo-1_avx2/arithmetic.c pqmayo_mayo-1_avx2/mayo.c pqmayo_mayo-1_avx2/params.c) + target_include_directories(mayo_1_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_avx2) + target_include_directories(mayo_1_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_1_avx2 PRIVATE -mavx2) + target_compile_options(mayo_1_avx2 PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_2) + add_library(mayo_2_opt OBJECT sig_mayo_2.c pqmayo_mayo-2_opt/api.c pqmayo_mayo-2_opt/arithmetic.c pqmayo_mayo-2_opt/mayo.c pqmayo_mayo-2_opt/params.c) + target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) + target_include_directories(mayo_2_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_opt) + target_include_directories(mayo_2_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_2_avx2) + add_library(mayo_2_avx2 OBJECT pqmayo_mayo-2_avx2/api.c pqmayo_mayo-2_avx2/arithmetic.c pqmayo_mayo-2_avx2/mayo.c pqmayo_mayo-2_avx2/params.c) + target_include_directories(mayo_2_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_avx2) + target_include_directories(mayo_2_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_2_avx2 PRIVATE -mavx2) + target_compile_options(mayo_2_avx2 PUBLIC -DMAYO_VARIANT=MAYO_2 
-DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_3) + add_library(mayo_3_opt OBJECT sig_mayo_3.c pqmayo_mayo-3_opt/api.c pqmayo_mayo-3_opt/arithmetic.c pqmayo_mayo-3_opt/mayo.c pqmayo_mayo-3_opt/params.c) + target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + target_include_directories(mayo_3_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_opt) + target_include_directories(mayo_3_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_3_avx2) + add_library(mayo_3_avx2 OBJECT pqmayo_mayo-3_avx2/api.c pqmayo_mayo-3_avx2/arithmetic.c pqmayo_mayo-3_avx2/mayo.c pqmayo_mayo-3_avx2/params.c) + target_include_directories(mayo_3_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_avx2) + target_include_directories(mayo_3_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_3_avx2 PRIVATE -mavx2) + target_compile_options(mayo_3_avx2 PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_5) + add_library(mayo_5_opt OBJECT sig_mayo_5.c pqmayo_mayo-5_opt/api.c pqmayo_mayo-5_opt/arithmetic.c pqmayo_mayo-5_opt/mayo.c pqmayo_mayo-5_opt/params.c) + target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + target_include_directories(mayo_5_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_opt) + target_include_directories(mayo_5_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 
-DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +if(OQS_ENABLE_SIG_mayo_5_avx2) + add_library(mayo_5_avx2 OBJECT pqmayo_mayo-5_avx2/api.c pqmayo_mayo-5_avx2/arithmetic.c pqmayo_mayo-5_avx2/mayo.c pqmayo_mayo-5_avx2/params.c) + target_include_directories(mayo_5_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_avx2) + target_include_directories(mayo_5_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims) + target_compile_options(mayo_5_avx2 PRIVATE -mavx2) + target_compile_options(mayo_5_avx2 PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT) + set(_MAYO_OBJS ${_MAYO_OBJS} $) +endif() + +set(MAYO_OBJS ${_MAYO_OBJS} PARENT_SCOPE) diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE new file mode 100644 index 0000000000..8f71f43fee --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE @@ -0,0 +1,202 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. 
+ + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. + + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. 
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. 
If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. 
You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. 
Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "{}" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright {yyyy} {name of copyright owner} + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE new file mode 100644 index 0000000000..53da47c21c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE @@ -0,0 +1,13 @@ +Copyright 2023 the MAYO team. All rights reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
\ No newline at end of file diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h new file mode 100644 index 0000000000..8e41c30f2c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h @@ -0,0 +1,30 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef AESCTR_H +#define AESCTR_H + +#include +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.c b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c new file mode 100644 index 0000000000..b7e2ef80ce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_1 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t 
*mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.h b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h new file mode 100644 index 0000000000..86b7bd545d --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 24 +#define CRYPTO_PUBLICKEYBYTES 1168 +#define CRYPTO_BYTES 321 + +#define CRYPTO_ALGNAME "MAYO-1" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + 
+#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for (int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * 
bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, 
param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char 
S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT 
IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least 
~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX 
== 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h new file mode 100644 index 0000000000..27b367e940 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h @@ -0,0 +1,78 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include 
+#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static + inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h 
b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one 4-word vector: out = in.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+}
+
+// Accumulate one 4-word vector: acc ^= in (XOR is the addition of this field).
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+}
+
+// acc ^= in * a for a single scalar a.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int i = 0; i < 4; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], a);
+    }
+}
+
+// acc ^= in * 0x2, i.e. in multiplied by the field element x.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int i = 0; i < 4; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+
+// acc ^= in * 0x9, i.e. in multiplied by x^-1 (x * (x^3 + 1) = 1 mod x^4+x+1).
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int i = 0; i < 4; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+    }
+}
+
+// out = sum over i of i * bins[i], bin i being the 4 words at bins + 4*i; bins is overwritten.
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+    m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+    m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+    m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+    m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+    m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+    m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+    m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+    m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+    m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+    m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+    m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+    m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+    vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one 6-word vector: out = in.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+    out[4] = in[4];
+    out[5] = in[5];
+}
+
+// Accumulate one 6-word vector: acc ^= in (XOR is the addition of this field).
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+    acc[4] ^= in[4];
+    acc[5] ^= in[5];
+}
+
+// acc ^= in * 0x2, i.e. in multiplied by the field element x.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int i = 0; i < 6; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+
+// acc ^= in * 0x9, i.e. in multiplied by x^-1 (x * (x^3 + 1) = 1 mod x^4+x+1).
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int i = 0; i < 6; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+    }
+}
+
+// acc ^= in * a for a single scalar a.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int i = 0; i < 6; i++) {
+        acc[i] ^= gf16v_mul_u64(in[i], a);
+    }
+}
+
+// out = sum over i of i * bins[i], where bin i is the 6 words at bins + 6*i. Bins 2..15 are
+// folded into bin 1 via x and x^-1 steps; bins is overwritten and bin 0 is never read.
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+    m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+    m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+    m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+    m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+    m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+    m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+    m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+    m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+    m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+    m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+    m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+    m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+    vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+#include
+
+#define K_OVER_2 ((K_MAX+1)/2)
+// Row i (32 bytes) holds j -> j * 2^i in GF(16) for j = 0..15, stored twice back to back.
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = {
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+//
+// Returns the table j -> j*b (j = 0..15, one byte each), held once in each 128-bit lane.
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+    __m256i bx = _mm256_set1_epi16( b & 0xf );
+    __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+    const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+    const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+    __m256i mask_1 = _mm256_set1_epi16(1);
+    __m256i mask_4 = _mm256_set1_epi16(4);
+    __m256i mask_0 = _mm256_setzero_si256();
+
+    // Bits 0 and 2 of b are tested on bx, bits 1 and 3 on b1 = bx >> 1; each set bit
+    // XORs in the matching row of __gf16_mulbase, using masks instead of branches.
+    return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+         ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+         ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+         ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+// For every byte of v: tab_l[low nibble] ^ tab_h[high nibble]; mask_f selects the nibble.
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) {
+    return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f);
+}
+
+// Multiplies each nibble packed in a by b; the result keeps the same nibble layout.
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+    __m256i multab_l = tbl32_gf16_multab2( b );
+    __m256i multab_h = _mm256_slli_epi16( multab_l, 4 );
+
+    return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) );
+}
+
+// O is read as V_MAX rows of O_MAX bytes (O_MAX must be even: entry c+1 is always read).
+// Entries c, c+1 share a register: table of c in the low nibbles, of c+1 in the high ones.
+static inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    // build multiplication tables
+    for (size_t r = 0; r < V_MAX; r++) {
+        for (size_t c = 0; c < O_MAX; c+=2) {
+            O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4);
+        }
+    }
+}
+
+// V is read as K_MAX rows of V_MAX bytes. For each column c, rows r and r+1 share one
+// register (row r low nibbles, row r+1 high); an odd last row gets a register of its own.
+static inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    // build multiplication tables
+    size_t r;
+    for (size_t c = 0; c < V_MAX; c++) {
+        for (r = 0; r+1 < K_MAX; r+= 2) {
+            V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4);
+        }
+#if K_MAX % 2 == 1
+        V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]);
+#endif
+    }
+}
+
+// Precomputed tables: the 32 bytes at offset 32*b hold j -> j*b for j = 0..15, stored twice.
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = {
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+// Same register layout as mayo_V_multabs_avx2, but each table is loaded from
+// mayo_gf16_mul at offset 32 * entry, so every byte of S1 must be < 16.
+static inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+    size_t r;
+    for (size_t c = 0; c < V_MAX; c++) {
+        for (r = 0; r+1 < K_MAX; r+= 2) {
+            S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]))
+                                         ^ _mm256_slli_epi16(_mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]));
+#endif
+    }
+}
+
+// As mayo_S1_multabs_avx2, for K_MAX rows of O_MAX bytes; every byte of S2 must be < 16.
+static inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    // build multiplication tables
+    size_t r;
+    for (size_t c = 0; c < O_MAX; c++) {
+        for (r = 0; r+1 < K_MAX; r+= 2) {
+            S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]))
+                                         ^ _mm256_slli_epi16(_mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i const *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]));
+#endif
+    }
+}
+
+// Multiplies each of the 16 nibbles of a by b (low 4 bits), as elements of
+// Z_2[x]/(x^4+x+1): r64 collects a * x^i for every set bit i of b.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    uint64_t mask_msb = 0x8888888888888888ULL;
+    uint64_t a_msb;
+    uint64_t a64 = a;
+    uint64_t b32 = b;
+    uint64_t r64 = a64 * (b32 & 1);
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); // times x; an overflowing x^4 folds back as x+1 (0x3)
+    r64 ^= (a64) * ((b32 >> 1) & 1);
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+    r64 ^= (a64) * ((b32 >> 2) & 1);
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+    r64 ^= (a64) * ((b32 >> 3) & 1);
+
+    return r64;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/.
+// +static inline __m256i tbl32_gf16_multab( uint8_t b ) { + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +/* put matrix in row echelon form with ones on first nonzero entries in constant time*/ +static inline void EF(unsigned char *A, int _nrows, int _ncols) { + + (void) _nrows; + (void) _ncols; + + #define nrows M_MAX + #define ncols (K_MAX * O_MAX + 1) + + #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32) + #define MAX_COLS (AVX_REGS_PER_ROW * 32) + + __m256i _pivot_row[AVX_REGS_PER_ROW]; + __m256i A_avx[AVX_REGS_PER_ROW* M_MAX]; + + unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row; + unsigned char* A_bytes = (unsigned char*) A_avx; + + // load A in the tail of AVX2 registers + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) + { + A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ]; + } + } + + // pivot row is secret, pivot col is not + unsigned char inverse; + int pivot_row = 0; + int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); + for (; pivot_col < MAX_COLS-128; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-96; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-64; pivot_col++) { + #include 
"echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-32; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS; pivot_col++) { + #include "echelon_form_loop.h" + } + + // write the matrix A back + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j]; + } + } + mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32); + mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows); +} + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h new file mode 100644 index 0000000000..b8b29741c4 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 + +int pivot_col_rounded = pivot_col/32; + +int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS); +int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols); +/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/ + +/* zero out pivot row */ +for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) { + _pivot_row[i] = _mm256_set1_epi8(0); +} + +/* try to get a pivot row in constant time */ +unsigned char pivot = 0; +uint32_t pivot_is_zero = -1; +for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); + uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); + __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j]; + } + pivot = pivot_row_bytes[pivot_col]; + pivot_is_zero = ~ct_compare_32((int) pivot, 0); +} + +/* multiply pivot row by inverse of pivot */ +inverse = inverse_f(pivot); +__m256i inverse_multab = tbl32_gf16_multab(inverse); + 
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); +} + +/* conditionally write pivot row to the correct row, if there is a nonzero pivot */ +/* eliminate entries below pivot */ +for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row)); + unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; + + __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); + if (row <= pivot_row_upper_bound) { + __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); + for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { + A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^ + _mm256_shuffle_epi8(multab, _pivot_row[col]); + } + } else { + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); + } + } +} + +pivot_row += (-(int32_t)(~pivot_is_zero)); + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + 
y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 
1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + 
} +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk 
= csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. 
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+
+    // s is already 0
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+// Signs m and writes "signature || message" into sm.
+//
+// The signature is produced in place at the start of sm by
+// mayo_sign_signature(); the message is then copied behind it and *smlen is
+// set to sig_bytes + mlen. On any failure *smlen is left untouched and a
+// non-MAYO_OK status is returned.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk);
+    if (ret != MAYO_OK)
+        goto err;
+    if (siglen != (size_t) param_sig_bytes) {
+        // An unexpected signature length is a failure: never report MAYO_OK
+        // while leaving *smlen unwritten.
+        ret = MAYO_ERR;
+        goto err;
+    }
+
+    memmove(sm + param_sig_bytes, m, mlen);
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+// Verifies a "signature || message" buffer and, on success, extracts the
+// message.
+//
+// Inputs shorter than sig_bytes are rejected with MAYO_ERR before any
+// verification is attempted. m and *mlen are only written when mayo_verify()
+// returns MAYO_OK; the status of mayo_verify() is returned unchanged.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    if (smlen < (size_t)param_sig_bytes) {
+        return MAYO_ERR;
+    }
+    int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm,
+                             pk);
+
+    if (result == MAYO_OK) {
+        *mlen = smlen - param_sig_bytes;
+        memmove(m, sm + param_sig_bytes, *mlen);
+    }
+
+    return result;
+}
+
+// Generates a compact key pair.
+//
+// csk receives sk_seed_bytes of fresh randomness (the secret seed). cpk
+// receives seed_pk (pk_seed_bytes) followed by P3_bytes of Upper(P3), where
+// P3 = O^t * (P1*O + P2) and P1, P2 are expanded from seed_pk with PK_PRF.
+// Returns MAYO_ERR only when randombytes() reports a failure (builds where
+// it has a return value), MAYO_OK otherwise.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // S holds the encoded secret O behind seed_pk, so it is scrubbed exactly
+    // like mayo_expand_sk() scrubs its copy. On the early error path S is
+    // still uninitialised; clearing it is harmless.
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+// Expands a compact public key: pk receives P1_bytes + P2_bytes of PK_PRF
+// output seeded with the first pk_seed_bytes of cpk, followed by the
+// P3_bytes stored in cpk after the seed. Always returns MAYO_OK.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes);
+    pk += param_P1_bytes + param_P2_bytes;
+    memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes);
+    return MAYO_OK;
+}
+
+// Expands a compact secret key (the seed) into sk.
+//
+// sk->p receives P1 followed by L, where L overwrites the P2 region in
+// place; sk->o receives the encoded O bytes taken from the SHAKE256 output.
+// The local copies of S and O are scrubbed before returning. Always returns
+// MAYO_OK.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    unsigned char *seed_pk = S;
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2;
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap back so sk->p is stored in the same byte order it was generated in
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+// Verifies sig over the message (m, mlen) under the compact public key cpk.
+//
+// The target t is SHAKE256(SHAKE256(m) || salt), with the salt read from the
+// last salt_bytes of sig; the signature is accepted when the reduced value
+// of S*P*S^T equals t. Returns MAYO_OK for a good signature and MAYO_ERR
+// otherwise.
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                       param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    compute_rhs(p, SPS, y, y);
+
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include
+#include
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64 \
+  { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3
+#define F_TAIL_96 \
+  { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x
+#define F_TAIL_128 \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2
+
+// Per-parameter-set constants. The *_v and *_A_cols entries are derived from
+// n, o and k; every other entry is a literal.
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+// Token pasting helpers. The two-level form forces MAYO_VARIANT to be
+// macro-expanded before it is pasted, e.g. PARAM_NAME(n) -> MAYO_1_n.
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+// MAYO_NAMESPACE(s) prefixes a symbol with pqmayo_<variant>_<build type>_ in
+// static-variant builds and leaves it unchanged otherwise.
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+// *_MAX bounds used to size buffers. With ENABLE_PARAMS_DYNAMIC they are
+// literal upper bounds; with MAYO_VARIANT they are the selected parameter
+// set's own values.
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+// Taken from the variant's own sk_seed_bytes rather than aliased to the salt
+// length; the two happen to be equal for MAYO_1/2/3/5.
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes)
+#else
+#error "Parameter not specified"
+#endif
+
+// PARAM_x(p) accessors: read the field from *p in dynamic builds, or expand
+// to the variant's compile-time constant (ignoring p) in static builds.
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail;
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): no PARAM_R_bytes accessor is defined in this header — confirm whether this field is used
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name;
+} mayo_params_t;
+
+/**
+ * Expanded secret key: p is sized for P1_BYTES_MAX + P2_BYTES_MAX bytes of
+ * 64-bit words, o for O_BYTES_MAX bytes.
+ */
+typedef struct sk_t {
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+    uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+/**
+ * MAYO detached signature generation.
+ *
+ * NOTE(review): this prototype carries no documentation in the original
+ * header; the parameters presumably mirror mayo_sign() with sig/siglen in
+ * place of sm/smlen — confirm against the definition in mayo.c.
+ */
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible to allocate sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). Returns 0 if the signature verification succeeded, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is verified
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include
+#include
+
+// Byte-swap helpers: compiler builtins on GCC/Clang, shift-and-mask fallback
+// elsewhere. The fallback evaluates its argument several times, so it must
+// not be given an expression with side effects.
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+// The mask is the sign bit of b - a smeared by a right shift of a signed
+// value, so this relies on the compiler shifting arithmetically.
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int32_t diff = b - a;
+    return (uint32_t) (diff >> (8*sizeof(uint32_t)-1));
+}
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint64_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/params.c b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h new file mode 100644 index 0000000000..27b416adce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h @@ -0,0 +1,524 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_128_H +#define SHUFFLE_ARITHMETIC_128_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + cols_used ++; + __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0; + acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1; + } + } +} + +static +inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1; + } + } +} + + +static +inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = 
_mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k )); + __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1)); + __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2)); + __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3)); + + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1); + } + } +} + + +static +inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i 
temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do 
multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + +static +inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + 
size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P1*S1 + P2*S2 -> P1: v x v, S1: v x k, P2: v x o, S2: o x k // P1 upper triangular +static +inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i
*P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], 
in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); 
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h new file mode 100644 index 0000000000..defff86f8f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_64_H +#define SHUFFLE_ARITHMETIC_64_H + +#include +#include +#include +#include + +// P1*O -> P1: v x v, O: v x o +static +inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and
accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t; + } + } +} + + +static +inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } + } +} + +static +inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const 
unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + pos); + pos += (V_MAX -c - 1); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k )); + __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1)); + + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4)); + _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); + } + } +} + + +static +inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = 
_mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static +inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to 
normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +static +inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular +// same as mayo_12_P1_times_Vt_avx2 +static +inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){ + 
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc); +} + +static +inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for 
one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P1*S1 + P2*S2 -> P1: v x v, S1: v x k, P2: v x o, S2: o x k // P1 upper triangular +static +inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even); + } + } + +
// P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P3 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= 
temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static inline +void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { + (void) size; + int m_vecs_stored = 0; + + for (int r = 0; r < O_MAX; ++r) { + const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); + __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in[0]; + m_vecs_stored++; + for (int c = r + 1; c < O_MAX; ++c) { + const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); + const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); + _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in2[0] ^ _in3[0]; + m_vecs_stored++; + } + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h new file mode 100644 index 0000000000..9b3a69d567 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_96_H +#define SHUFFLE_ARITHMETIC_96_H + +#include +#include +#include +#include + + +// P1*O -> P1: v x v, O: v x o +static +inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 =
_mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t 
*acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + cols_used ++; + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); 
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + 
temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){ + const __m256i 
low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ 
temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < 
K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + // P1 times S1 
+ __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 times S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k 
+ 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = 
_mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ 
_mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +/* acc ^= S1^T * PS1. Forwards its arguments unchanged to mayo_3_Vt_times_Pv_avx2, with the S1 lookup tables taking the place of the V tables. */ +static +inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){ + mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); +} + +/* acc ^= S2^T * PS2, using 6 uint64_t words per m-vector. For each column c of PS2 the products over the O_MAX rows are gathered in temp[] (low and high input nibbles are looked up separately), then repacked and XORed into the m-vectors at acc[k*K_MAX + c]. */ +static +inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^
_mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +/* Returns the product of a and b as elements of GF(16) = GF(2)[x]/(x^4 + x + 1), one element per byte. Only bits 0..3 of a are read. High bits of b are not masked before multiplying and would change the result, so b is expected to be below 16 - NOTE(review): confirm callers guarantee this. The result is masked to 4 bits. */ +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + /* bits 4..6 of p are the x^4..x^6 terms; since x^4 = x + 1 each one is folded down by 4 and by 3 bit positions */ + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +/* Same multiplication applied to eight GF(16) elements at once: b carries one element in the low nibble of each byte and every byte lane is multiplied by the scalar a. The result is masked to the low nibble of each byte. */ +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; +
p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +/* addition in characteristic 2 is a bitwise XOR of the two elements */ +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +/* identical to add_f: in characteristic 2 subtraction is also XOR */ +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +/* every element is its own additive inverse, so negation returns a unchanged */ +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +/* Multiplicative inverse in GF(16), computed as a^14 with five mul_f calls (a^2, a^4, a^8, a^6 = a^2 * a^4, a^14 = a^8 * a^6). Nonzero elements satisfy a^15 = 1, hence a^14 = 1/a; the input 0 yields 0, which matches entry 0 of the commented-out lookup table kept below for reference. */ +static inline unsigned char inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +/* Dot product over GF(16): the sum of a[i] * b[i*m] for i in [0, n). a is read contiguously and b with a stride of m bytes, i.e. down one column of a row-major matrix that has m columns. */ +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +/* c = a * b over GF(16), one element per byte, row-major: a is row_a x colrow_ab, b is colrow_ab x col_b, and c (row_a x col_b) is written sequentially. */ +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +/* c = a + b elementwise for m x n row-major matrices of GF(16) elements. */ +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +/* Copies m_legs * 2 uint64_t words from in to out. */ +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +/* XORs the m_legs * 2 uint64_t words of in into acc. */ +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +}
+ +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +/* Multiplies each of the eight nibble-packed GF(16) elements of a by the field element held in the low four bits of b. r32 starts as a times bit 0 of b; a is then multiplied by x three times (clear the top bit of every nibble, shift left by one, and XOR 0b0011 = x + 1 into each nibble whose top bit was set, because x^4 = x + 1), and each multiple is XORed into r32 when the matching bit of b is set. */ +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +/* acc ^= a * in over m_legs * 2 uint64_t words. NOTE(review): gf16v_mul_u64 is not defined in the visible part of this header - presumably the 64-bit counterpart of gf16v_mul_u32; confirm it is in scope. */ +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +/* NOTE(review): this copy is truncated inside the loop header below. The remainder of this header and the opening of the next one (including the include targets) are missing, so the text jumps straight to the AES declarations. Restore from upstream before relying on it. */ +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +/* Fallback used when ENABLE_AESNI is not defined: fills output with outputByteLen bytes from aes128ctr_prf, passing input as its third argument together with an all-zero 12-byte iv. inputByteLen is ignored. Returns outputByteLen cast to int. */ +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.c b/src/sig/mayo/pqmayo_mayo-1_opt/api.c new file mode 100644 index 0000000000..b7e2ef80ce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.c @@ -0,0 +1,46 @@ +//
SPDX-License-Identifier: Apache-2.0 + +#include +#include + +/* Parameter handle passed to every mayo_* call below: the address of MAYO_1 when ENABLE_PARAMS_DYNAMIC is defined, otherwise 0. */ +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_1 +#else +#define MAYO_PARAMS 0 +#endif + +/* Forwards to mayo_keypair with MAYO_PARAMS and returns its status. */ +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +/* Forwards to mayo_sign with MAYO_PARAMS and returns its status. */ +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +/* Forwards to mayo_sign_signature with MAYO_PARAMS and returns its status. */ +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +/* Forwards to mayo_open with MAYO_PARAMS and returns its status. */ +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +/* Returns -1 without calling the verifier unless siglen equals CRYPTO_BYTES; otherwise returns the result of mayo_verify. Note the argument order changes: mayo_verify takes the message before the signature. */ +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.h b/src/sig/mayo/pqmayo_mayo-1_opt/api.h new file mode 100644 index 0000000000..86b7bd545d --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +/* Byte sizes for the algorithm named by CRYPTO_ALGNAME: secret key, public key, and signature (CRYPTO_BYTES). */ +#define CRYPTO_SECRETKEYBYTES 24 +#define CRYPTO_PUBLICKEYBYTES 1168 +#define CRYPTO_BYTES 321 + +#define CRYPTO_ALGNAME "MAYO-1" + +/* Each entry point is renamed through MAYO_NAMESPACE before it is declared. */ +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const
unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +/* Folds a size x size matrix of m-vectors into upper-triangular form: for every r <= c the output holds in[r][c], plus in[c][r] when r != c, stored back to back in row order. Each m-vector occupies m_legs * 2 uint64_t words. When MAYO_VARIANT is defined, MAYO_AVX is set and M_MAX == 64, the work is handed to mayo12_m_upper instead. */ +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +/* Accumulates (P1 + P1^T) * O into acc. AVX builds dispatch on M_MAX to the matching mayo_12 / mayo_3 / mayo_5 kernel; the generic path walks the upper-triangular P1 entry by entry and skips its diagonal entries. */ +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for
(int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, 
V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + 
mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + 
mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row 
of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git 
a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned 
// Helpers for vectors of 128 field elements of Z_2[x]/(x^4+x+1), held in
// 8 uint64_t limbs with one element per nibble.

// out <- in (limbs are copied in ascending order).
static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
    for (int limb = 0; limb < 8; limb++) {
        out[limb] = in[limb];
    }
}

// acc <- acc + in; addition in characteristic 2 is a limb-wise XOR.
static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
    for (int limb = 0; limb < 8; limb++) {
        acc[limb] ^= in[limb];
    }
}

// acc <- acc + x * in, element-wise.
// The top bit of every nibble is stripped before the shift and folded back in
// as x + 1 (= 3), which is x^4 reduced modulo x^4 + x + 1.
static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
    const uint64_t top_bits = 0x8888888888888888ULL;
    for (int limb = 0; limb < 8; limb++) {
        const uint64_t word = in[limb];
        const uint64_t carry = word & top_bits;
        acc[limb] ^= ((word ^ carry) << 1) ^ ((carry >> 3) * 3);
    }
}

// acc <- acc + x^-1 * in, element-wise.
// The low bit of every nibble is stripped before the shift and folded back in
// as x^3 + 1 (= 9), the inverse of x modulo x^4 + x + 1.
static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
    const uint64_t low_bits = 0x1111111111111111ULL;
    for (int limb = 0; limb < 8; limb++) {
        const uint64_t word = in[limb];
        const uint64_t borrow = word & low_bits;
        acc[limb] ^= ((word ^ borrow) >> 1) ^ (borrow * 9);
    }
}
// acc <- acc + a * in for a vector of 128 field elements (8 limbs).
// mul_table(a) supplies one multiplier per bit position of a nibble; each bit
// plane of the input is isolated and multiplied by its multiplier.
static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
    const uint32_t tab = mul_table(a);
    const uint64_t nibble_lsb = 0x1111111111111111ULL;

    const uint32_t f0 = tab & 0xff;
    const uint32_t f1 = (tab >> 8) & 0xf;
    const uint32_t f2 = (tab >> 16) & 0xf;
    const uint32_t f3 = (tab >> 24) & 0xf;

    for (int limb = 0; limb < 8; limb++) {
        const uint64_t word = in[limb];
        acc[limb] ^= (word & nibble_lsb) * f0
                   ^ ((word >> 1) & nibble_lsb) * f1
                   ^ ((word >> 2) & nibble_lsb) * f2
                   ^ ((word >> 3) & nibble_lsb) * f3;
    }
}

// Fold 16 bins of 8 limbs each (bin i starts at bins + i*8) into out.
// The bins are chained through multiply-by-x^-1 and multiply-by-x
// accumulations that both end in bin 1, which is then copied to out.
// Bin 0 is never read; the contents of bins are modified.
static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
    const int limbs = 8;

    m_vec_mul_add_x_inv_128(&bins[5 * limbs], &bins[10 * limbs]);
    m_vec_mul_add_x_128(&bins[11 * limbs], &bins[12 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[10 * limbs], &bins[7 * limbs]);
    m_vec_mul_add_x_128(&bins[12 * limbs], &bins[6 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[7 * limbs], &bins[14 * limbs]);
    m_vec_mul_add_x_128(&bins[6 * limbs], &bins[3 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[14 * limbs], &bins[15 * limbs]);
    m_vec_mul_add_x_128(&bins[3 * limbs], &bins[8 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[15 * limbs], &bins[13 * limbs]);
    m_vec_mul_add_x_128(&bins[8 * limbs], &bins[4 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[13 * limbs], &bins[9 * limbs]);
    m_vec_mul_add_x_128(&bins[4 * limbs], &bins[2 * limbs]);
    m_vec_mul_add_x_inv_128(&bins[9 * limbs], &bins[1 * limbs]);
    m_vec_mul_add_x_128(&bins[2 * limbs], &bins[1 * limbs]);

    vec_copy_128(&bins[1 * limbs], out);
}

// The helpers below belong to arithmetic_64.h: vectors of 64 field elements
// of Z_2[x]/(x^4+x+1), held in 4 uint64_t limbs.

// out <- in (limbs are copied in ascending order).
static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
    for (int limb = 0; limb < 4; limb++) {
        out[limb] = in[limb];
    }
}

// acc <- acc + in; addition in characteristic 2 is a limb-wise XOR.
static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
    for (int limb = 0; limb < 4; limb++) {
        acc[limb] ^= in[limb];
    }
}
// Multiply every nibble of a (16 packed field elements of Z_2[x]/(x^4+x+1))
// by the field element b. Only the four low bits of b are used.
static inline uint64_t gf16v_mul_u64(uint64_t a, uint8_t b) {
    const uint64_t nibble_msb = 0x8888888888888888ULL;
    const uint64_t scalar = b;
    uint64_t shifted = a;   // a * x^bit, element-wise
    uint64_t product = 0;

    for (int bit = 0; bit < 4; bit++) {
        // Add a * x^bit when the corresponding bit of b is set.
        product ^= shifted * ((scalar >> bit) & 1);

        // shifted <- shifted * x: strip the top bit of every nibble, shift,
        // and fold the stripped bit back in as x + 1 (= 3).
        const uint64_t carry = shifted & nibble_msb;
        shifted = ((shifted ^ carry) << 1) ^ ((carry >> 3) * 3);
    }
    return product;
}

// Build a 4-byte table for multiplication by b: byte j holds b * 2^j with the
// bits that spilled into the high nibble folded back into the low nibble.
// The high nibbles are not cleared here; callers mask the bytes they use.
static inline uint32_t mul_table(uint8_t b) {
    const uint32_t spread = ((uint32_t) b) * 0x08040201;
    const uint32_t overflow = spread & 0xf0f0f0f0;
    return spread ^ (overflow >> 4) ^ (overflow >> 3);
}

// acc <- acc + a * in for a vector of 64 field elements (4 limbs).
static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
    const uint32_t tab = mul_table(a);
    const uint64_t nibble_lsb = 0x1111111111111111ULL;

    const uint32_t f0 = tab & 0xff;
    const uint32_t f1 = (tab >> 8) & 0xf;
    const uint32_t f2 = (tab >> 16) & 0xf;
    const uint32_t f3 = (tab >> 24) & 0xf;

    for (int limb = 0; limb < 4; limb++) {
        const uint64_t word = in[limb];
        acc[limb] ^= (word & nibble_lsb) * f0
                   ^ ((word >> 1) & nibble_lsb) * f1
                   ^ ((word >> 2) & nibble_lsb) * f2
                   ^ ((word >> 3) & nibble_lsb) * f3;
    }
}

// acc <- acc + a * in for a vector of `legs` limbs.
static inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
    const uint32_t tab = mul_table(a);
    const uint64_t nibble_lsb = 0x1111111111111111ULL;

    const uint32_t f0 = tab & 0xff;
    const uint32_t f1 = (tab >> 8) & 0xf;
    const uint32_t f2 = (tab >> 16) & 0xf;
    const uint32_t f3 = (tab >> 24) & 0xf;

    for (int limb = 0; limb < legs; limb++) {
        const uint64_t word = in[limb];
        acc[limb] ^= (word & nibble_lsb) * f0
                   ^ ((word >> 1) & nibble_lsb) * f1
                   ^ ((word >> 2) & nibble_lsb) * f2
                   ^ ((word >> 3) & nibble_lsb) * f3;
    }
}

// acc <- acc + x * in, element-wise, for 4 limbs.
static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
    const uint64_t top_bits = 0x8888888888888888ULL;
    for (int limb = 0; limb < 4; limb++) {
        const uint64_t word = in[limb];
        const uint64_t carry = word & top_bits;
        acc[limb] ^= ((word ^ carry) << 1) ^ ((carry >> 3) * 3);
    }
}

// acc <- acc + x^-1 * in, element-wise, for 4 limbs.
static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
    const uint64_t low_bits = 0x1111111111111111ULL;
    for (int limb = 0; limb < 4; limb++) {
        const uint64_t word = in[limb];
        const uint64_t borrow = word & low_bits;
        acc[limb] ^= ((word ^ borrow) >> 1) ^ (borrow * 9);
    }
}
// Fold 16 bins of 4 limbs each (bin i starts at bins + i*4) into out.
// The bins are chained through multiply-by-x^-1 and multiply-by-x
// accumulations that both end in bin 1, which is then copied to out.
// Bin 0 is never read; the contents of bins are modified.
static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
    const int limbs = 4;

    m_vec_mul_add_x_inv_64(&bins[5 * limbs], &bins[10 * limbs]);
    m_vec_mul_add_x_64(&bins[11 * limbs], &bins[12 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[10 * limbs], &bins[7 * limbs]);
    m_vec_mul_add_x_64(&bins[12 * limbs], &bins[6 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[7 * limbs], &bins[14 * limbs]);
    m_vec_mul_add_x_64(&bins[6 * limbs], &bins[3 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[14 * limbs], &bins[15 * limbs]);
    m_vec_mul_add_x_64(&bins[3 * limbs], &bins[8 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[15 * limbs], &bins[13 * limbs]);
    m_vec_mul_add_x_64(&bins[8 * limbs], &bins[4 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[13 * limbs], &bins[9 * limbs]);
    m_vec_mul_add_x_64(&bins[4 * limbs], &bins[2 * limbs]);
    m_vec_mul_add_x_inv_64(&bins[9 * limbs], &bins[1 * limbs]);
    m_vec_mul_add_x_64(&bins[2 * limbs], &bins[1 * limbs]);

    vec_copy_64(&bins[1 * limbs], out);
}

// The helpers below belong to arithmetic_96.h: vectors of 96 field elements
// of Z_2[x]/(x^4+x+1), held in 6 uint64_t limbs.

// out <- in (limbs are copied in ascending order).
static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
    for (int limb = 0; limb < 6; limb++) {
        out[limb] = in[limb];
    }
}

// acc <- acc + in; addition in characteristic 2 is a limb-wise XOR.
static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
    for (int limb = 0; limb < 6; limb++) {
        acc[limb] ^= in[limb];
    }
}

// acc <- acc + x * in, element-wise, for 6 limbs.
// The top bit of every nibble is stripped before the shift and folded back in
// as x + 1 (= 3), which is x^4 reduced modulo x^4 + x + 1.
static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
    const uint64_t top_bits = 0x8888888888888888ULL;
    for (int limb = 0; limb < 6; limb++) {
        const uint64_t word = in[limb];
        const uint64_t carry = word & top_bits;
        acc[limb] ^= ((word ^ carry) << 1) ^ ((carry >> 3) * 3);
    }
}
// acc <- acc + x^-1 * in, element-wise, for 6 limbs.
// The low bit of every nibble is stripped before the shift and folded back in
// as x^3 + 1 (= 9), the inverse of x modulo x^4 + x + 1.
static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
    const uint64_t low_bits = 0x1111111111111111ULL;
    for (int limb = 0; limb < 6; limb++) {
        const uint64_t word = in[limb];
        const uint64_t borrow = word & low_bits;
        acc[limb] ^= ((word ^ borrow) >> 1) ^ (borrow * 9);
    }
}

// acc <- acc + a * in for a vector of 96 field elements (6 limbs).
// mul_table(a) supplies one multiplier per bit position of a nibble; each bit
// plane of the input is isolated and multiplied by its multiplier.
static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
    const uint32_t tab = mul_table(a);
    const uint64_t nibble_lsb = 0x1111111111111111ULL;

    const uint32_t f0 = tab & 0xff;
    const uint32_t f1 = (tab >> 8) & 0xf;
    const uint32_t f2 = (tab >> 16) & 0xf;
    const uint32_t f3 = (tab >> 24) & 0xf;

    for (int limb = 0; limb < 6; limb++) {
        const uint64_t word = in[limb];
        acc[limb] ^= (word & nibble_lsb) * f0
                   ^ ((word >> 1) & nibble_lsb) * f1
                   ^ ((word >> 2) & nibble_lsb) * f2
                   ^ ((word >> 3) & nibble_lsb) * f3;
    }
}

// Fold 16 bins of 6 limbs each (bin i starts at bins + i*6) into out.
// The bins are chained through multiply-by-x^-1 and multiply-by-x
// accumulations that both end in bin 1, which is then copied to out.
// Bin 0 is never read; the contents of bins are modified.
static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
    const int limbs = 6;

    m_vec_mul_add_x_inv_96(&bins[5 * limbs], &bins[10 * limbs]);
    m_vec_mul_add_x_96(&bins[11 * limbs], &bins[12 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[10 * limbs], &bins[7 * limbs]);
    m_vec_mul_add_x_96(&bins[12 * limbs], &bins[6 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[7 * limbs], &bins[14 * limbs]);
    m_vec_mul_add_x_96(&bins[6 * limbs], &bins[3 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[14 * limbs], &bins[15 * limbs]);
    m_vec_mul_add_x_96(&bins[3 * limbs], &bins[8 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[15 * limbs], &bins[13 * limbs]);
    m_vec_mul_add_x_96(&bins[8 * limbs], &bins[4 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[13 * limbs], &bins[9 * limbs]);
    m_vec_mul_add_x_96(&bins[4 * limbs], &bins[2 * limbs]);
    m_vec_mul_add_x_inv_96(&bins[9 * limbs], &bins[1 * limbs]);
    m_vec_mul_add_x_96(&bins[2 * limbs], &bins[1 * limbs]);

    vec_copy_96(&bins[1 * limbs], out);
}
#ifndef MAYO_VARIANT
// Fold 16 bins (bin i starts at bins + i * m_legs * 2) into out, for builds
// without a fixed MAYO_VARIANT.
// Each bin whose index is not a power of two is first added into two bins
// whose indices XOR to it (15 -> 12 and 3, 14 -> 8 and 6, ...). The remaining
// power-of-two bins 8, 4, 2 are then chained into bin 1 with multiply-by-x
// accumulations, and bin 1 is copied to out. The contents of bins are modified.
static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {

    m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2);
    m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 3 * m_legs * 2);

    m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2);
    m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2);

    m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2);
    m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2);

    m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2);
    m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2);

    m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2);
    m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2);

    m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2);
    m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2);

    m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2);
    m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2);

    m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2);
    m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2);

    m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2);
    m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2);

    m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2);
    m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2);

    m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2);
    m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2);

    // Only bins 8, 4, 2 and 1 still matter: 8*x^0 -> 4, 4 -> 2, 2 -> 1.
    m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2);
    m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2);
    m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2);

    m_vec_copy(m_legs, bins + 1 * m_legs * 2, out);
}
#endif

// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
//                   [  0  P3 ]   [S2]   [        P3*S2]
//
// P1 and P3 are walked as upper-triangular matrices (only entries with
// column >= row are consumed, in row-major order); P2 is a full v x o matrix.
// S is read as k rows of n = o + v bytes. Instead of multiplying, every entry
// of P is added into one of 16 accumulator bins selected by the matching
// entry of S, and the bins are folded afterwards by the multiply_bins helpers.
// NOTE(review): the bin index comes straight from S, so entries of S are
// presumably always below 16 -- confirm with the callers.
static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
                              const int m, const int v, const int o, const int k, uint64_t *PS) {

    const int n = o + v;
#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
    (void)m;
#else
    const int m_legs = m / 32;
#endif

    /* Old approach which is constant time but doesn't have to be
    unsigned char S1[V_MAX*K_MAX];
    unsigned char S2[O_MAX*K_MAX];
    unsigned char *s1_write = S1;
    unsigned char *s2_write = S2;
    for (int r=0; r < k; r++)
    {
        for (int c = 0; c < n; c++)
        {
            if(c < v){
                *(s1_write++) = S[r*n + c];
            } else {
                *(s2_write++) = S[r*n + c];
            }
        }
    }

    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
    */

    // use more stack efficient version for MAYO_3 and MAYO_5
    // This variant handles one row of S (index col) at a time, so the
    // accumulator only needs 16 bins per row of P instead of per (row, col).
    // NOTE(review): the fallback branches below call bitsliced_m_vec_add and
    // bitsliced_m_multiply_bins, which are not defined in the code visible
    // here (the other path uses m_vec_add / m_multiply_bins) -- confirm that
    // these names exist for builds that reach them.
    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
    uint64_t accumulator[M_MAX * N_MAX] = {0};
    int P1_used;
    int P3_used;
    for (int col = 0; col < k; col++) {
        // reset all bins before processing the next row of S
        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
            accumulator[i] = 0;
        }
        P1_used = 0;
        for (int row = 0; row < v; row++) {
            // P1 part: upper-triangular entries (row, j) with j >= row
            for (int j = row; j < v; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64)
                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
#else
                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
#endif
                P1_used ++;
            }

            // P2 part: full row of o entries, paired with the last o bytes of the S row
            for (int j = 0; j < o; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64)
                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
#else
                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
#endif
            }
        }

        // P3 part: upper-triangular entries for rows v..n-1
        P3_used = 0;
        for (int row = v; row < n; row++) {
            for (int j = row; j < n; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64)
                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
#else
                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
#endif
                P3_used ++;
            }
        }

        // fold the 16 bins of every row into entry (row, col) of PS
        for (int row = 0; row < n; row++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64)
            multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
            multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
            multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
#else
            bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
#endif
        }
    }

    #else

    // Default variant: 16 bins for every (row, col) pair are kept at once.
    // The fixed-parameter branches below are the generic `for col < k` loop
    // unrolled K_MAX times.
    // NOTE(review): the unrolled branches match the generic loop only if
    // k == K_MAX, which is presumably guaranteed when MAYO_VARIANT is
    // defined -- confirm.
    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
    int P1_used = 0;
    for (int row = 0; row < v; row++) {
        // P1 part: upper-triangular entries (row, j) with j >= row
        for (int j = row; j < v; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
#else
            for (int col = 0; col < k; col++) {
                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
            }
#endif
            P1_used ++;
        }


        // P2 part: full row of o entries, paired with the last o bytes of each S row
        for (int j = 0; j < o; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
#else
            for (int col = 0; col < k; col++) {
                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
            }
#endif
        }
    }

    // P3 part: upper-triangular entries for rows v..n-1
    int P3_used = 0;
    for (int row = v; row < n; row++) {
        for (int j = row; j < n; j++) {
#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
#else
            for (int col = 0; col < k; col++) {
                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
            }
#endif
            P3_used ++;
        }
    }

    // multiply stuff according to the bins of the accumulator and add to PS.
    // (one group of 16 bins per entry of the n x k result)
    int i = 0;
    while (i < n * k) {
#if defined(MAYO_VARIANT) && (M_MAX == 64)
        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
        i++;
#elif defined(MAYO_VARIANT) && (M_MAX == 96)
        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
        i++;
#elif defined(MAYO_VARIANT) && (M_MAX == 128)
        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
        i++;
#else
        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
        i++;
#endif
    }

    #endif
}
+ int i = 0; + while (i < k*k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8); + i++; +#else + m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2); + i++; +#endif + } +} + + +// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { + + int bs_mat_entries_used = 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { + + int bs_mat_entries_used 
= 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_rows; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies the transpose of a single matrix with m matrices and adds result to acc +static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_cols; r++) { + for (int c = 0; c < mat_rows; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} + +// multiplies a single matrix with m matrices and adds result to acc +static 
inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_rows; r++) { + for (int c = 0; c < mat_cols; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h new file mode 100644 index 0000000000..82505847c9 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h @@ -0,0 +1,152 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ECHELON_FORM_H +#define ECHELON_FORM_H + +#include +#include +#include +#include + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y))

/*
 * Return the 4-bit element at position `index` of a packed vector.
 * Sixteen elements are stored per 64-bit word: element i lives in word
 * i / 16, at bit offset 4 * (i % 16).
 */
static inline unsigned char
m_extract_element(const uint64_t *in, int index) {
    const int leg = index / 16;
    const int offset = index % 16;

    return (in[leg] >> (offset*4)) & 0xF;
}

/*
 * Pack `ncols` elements from `in` (one element per byte) into `out`, two
 * elements per byte: the even-indexed element goes to the low nibble and the
 * odd-indexed one to the high nibble. For odd `ncols` the last element is
 * written alone into the low nibble of the final byte.
 *
 * Only (ncols + 1) / 2 bytes of `out` are written; any remaining bytes keep
 * their previous contents.
 * NOTE(review): the shifts assume every in[i] fits in 4 bits; nothing here
 * masks or checks that - confirm against callers.
 */
static inline void
ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
    int i;
    unsigned char *out8 = (unsigned char *)out;
    for(i = 0; i+1 < ncols; i += 2){
#ifdef TARGET_BIG_ENDIAN
        // byte b = i/2 is stored at 8*(b/8) + 7 - b%8, i.e. mirrored inside
        // its 8-byte group
        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4);
#else
        out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4);
#endif
    }
    // after the loop i is the index of the unpaired last element, if any
    if (ncols % 2 == 1){
#ifdef TARGET_BIG_ENDIAN
        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0);
#else
        out8[i/2] = (in[i+0] << 0);
#endif
    }
}

/*
 * Inverse of ef_pack_m_vec: expand `legs` 64-bit words from `in` into
 * legs * 16 bytes in `out`, one 4-bit element per byte (low nibble first).
 * `out` must have room for legs * 16 bytes.
 */
static inline void
ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
    const unsigned char *in8 = (const unsigned char *)in;
    for(int i = 0; i < legs * 16; i += 2){
#ifdef TARGET_BIG_ENDIAN
        // same mirrored byte index as in ef_pack_m_vec
        out[i]   = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF;
        out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
#else
        out[i]   = (in8[i/2]) & 0xF;
        out[i+1] = (in8[i/2] >> 4);
#endif
    }
}


// put matrix in row echelon form with ones on first nonzero entries *in
// constant time*
static inline void EF(unsigned char *A, int nrows, int ncols) {

    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };

    // number of 64-bit words needed for one packed row (16 elements per word)
    int row_len = (ncols + 15) / 16;

    // nibbleslice the matrix A
    for (int i = 0; i < nrows; i++) {
        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
    }

    // pivot row is secret, pivot col is not

    unsigned char inverse;
    int pivot_row = 0;
    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {

        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
        // the pivot row is
guaranteed to be between these lower and upper bounds if + // A has full rank + + // zero out pivot row + for (int i = 0; i < row_len; i++) { + _pivot_row[i] = 0; + _pivot_row2[i] = 0; + } + + // try to get a pivot row in constant time + unsigned char pivot = 0; + uint64_t pivot_is_zero = -1; + for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + + uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); + uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); + + for (int j = 0; j < row_len; j++) { + _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) & + packed_A[row * row_len + j]; + } + pivot = m_extract_element(_pivot_row, pivot_col); + pivot_is_zero = ~ct_compare_64((int) pivot, 0); + } + + // multiply pivot row by inverse of pivot + inverse = inverse_f(pivot); + vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); + + // conditionally write pivot row to the correct row, if there is a nonzero + // pivot + for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) { + uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero; + uint64_t do_not_copy = ~do_copy; + for (int col = 0; col < row_len; col++) { + packed_A[row * row_len + col] = + (do_not_copy & packed_A[row * row_len + col]) + + (do_copy & _pivot_row2[col]); + } + } + + // eliminate entries below pivot + for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (row > pivot_row); + unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col); + + vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim, + packed_A + row * row_len); + } + + pivot_row += (-(int64_t)(~pivot_is_zero)); + } + + unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; + + // unbitslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_unpack_m_vec(row_len, packed_A + i * row_len, temp); + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = temp[j]; + } + } + 
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15); + mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + 
temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef 
ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + 
uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + } +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int 
param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk = csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, 
param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. + alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? 
+ for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char 
S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && 
!defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 
+ P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + (param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - 
param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 
+#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" +#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define 
MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX 
PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define 
PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. 
+ * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * The key provided is a compacted secret key. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * The key provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m.
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key. + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key.
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * The key provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[in] m Message whose signature is checked + * @param[in] mlen Length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mem.h b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/params.c b/src/sig/mayo/pqmayo_mayo-1_opt/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char 
inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} + +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 
2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.c b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c new file mode 100644 index 0000000000..a7cf85eedf --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_2 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + 
return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.h b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h new file mode 100644 index 0000000000..265a5639db --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 24 +#define CRYPTO_PUBLICKEYBYTES 5488 +#define CRYPTO_BYTES 180 + +#define CRYPTO_ALGNAME "MAYO-2" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- 
/dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for (int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], 
acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * 
K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const 
int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, 
S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 columns (values in the last column are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); 
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && 
(M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h new file mode 100644 index 0000000000..27b367e940 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h @@ -0,0 +1,78 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1); each vector is accessed as 8 uint64_t limbs (index 0..7), i.e. 16 four-bit elements per limb. gf16v_mul_u64 is used below but is not defined in this header. +// vec_copy_128: out[i] = in[i] for the 8 limbs of one vector. +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + +// vec_add_128: acc[i] ^= in[i] for the 8 limbs (XOR is addition in characteristic 2). +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} +// m_vec_mul_add_x_128 / m_vec_mul_add_x_inv_128: acc[i] ^= gf16v_mul_u64(in[i], c) with the fixed multiplier c = 0x2 (the element x) resp. 0x9 (x^3+1, the inverse of x because x*(x^3+1) = x^4+x = 1 mod x^4+x+1). +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} +// vec_mul_add_128: acc[i] ^= gf16v_mul_u64(in[i], a) for a caller-supplied multiplier a. +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} +// multiply_bins_128: bins is indexed in steps of 8 limbs (bin j starts at bins + j*8); bins 2..15 are folded into bin 1 by the fixed chain of x / x^-1 multiply-accumulates below (bins is modified in place), then bin 1 is copied to out. Bin 0 is not read. +static + inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + 
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h new file mode 100644 index 0000000000..9f7535c878 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + 
m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h new file mode 100644 index 0000000000..86359679fb --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + 
m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h new file mode 100644 index 0000000000..eeb13dc0bd --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include +#include + +#define K_OVER_2 ((K_MAX+1)/2) + +static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, + 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, + 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01 +}; + +// +// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper! 
+// +static inline __m256i tbl32_gf16_multab2( uint8_t b ) { + + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { + return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f); +} + +static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) { + __m256i multab_l = tbl32_gf16_multab2( b ); + __m256i multab_h = _mm256_slli_epi16( multab_l, 4 ); + + return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) ); +} + +static +inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){ + // build multiplication tables + for (size_t r = 0; r < V_MAX; r++) + { + for (size_t c = 0; c < O_MAX; c+=2) + { + O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4); + } + } +} + + +static +inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){ + // build multiplication tables + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ 
_mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4); + } +#if K_MAX % 2 == 1 + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); +#endif + } +} + +static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 
0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a}; + + +static +inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) { + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])); +#endif + } +} + +static +inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) { + // build multiplication tables + size_t r; + for (size_t c = 0; c < O_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i 
*)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ; +#endif + } +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h new file mode 100644 index 0000000000..fa69de0ab2 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) + + +// +// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/. 
+// +static inline __m256i tbl32_gf16_multab( uint8_t b ) { + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +/* put matrix in row echelon form with ones on first nonzero entries in constant time*/ +static inline void EF(unsigned char *A, int _nrows, int _ncols) { + + (void) _nrows; + (void) _ncols; + + #define nrows M_MAX + #define ncols (K_MAX * O_MAX + 1) + + #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32) + #define MAX_COLS (AVX_REGS_PER_ROW * 32) + + __m256i _pivot_row[AVX_REGS_PER_ROW]; + __m256i A_avx[AVX_REGS_PER_ROW* M_MAX]; + + unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row; + unsigned char* A_bytes = (unsigned char*) A_avx; + + // load A in the tail of AVX2 registers + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) + { + A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ]; + } + } + + // pivot row is secret, pivot col is not + unsigned char inverse; + int pivot_row = 0; + int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); + for (; pivot_col < MAX_COLS-128; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-96; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-64; pivot_col++) { + #include 
"echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-32; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS; pivot_col++) { + #include "echelon_form_loop.h" + } + + // write the matrix A back + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j]; + } + } + mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32); + mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows); +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h new file mode 100644 index 0000000000..b8b29741c4 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 + +int pivot_col_rounded = pivot_col/32; + +int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS); +int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols); +/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/ + +/* zero out pivot row */ +for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) { + _pivot_row[i] = _mm256_set1_epi8(0); +} + +/* try to get a pivot row in constant time */ +unsigned char pivot = 0; +uint32_t pivot_is_zero = -1; +for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); + uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); + __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j]; + } + pivot = pivot_row_bytes[pivot_col]; + pivot_is_zero = ~ct_compare_32((int) pivot, 0); +} + +/* multiply pivot row by inverse of pivot */ +inverse = inverse_f(pivot); +__m256i inverse_multab = tbl32_gf16_multab(inverse); + 
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); +} + +/* conditionally write pivot row to the correct row, if there is a nonzero pivot */ +/* eliminate entries below pivot */ +for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row)); + unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; + + __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); + if (row <= pivot_row_upper_bound) { + __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); + for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { + A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^ + _mm256_shuffle_epi8(multab, _pivot_row[col]); + } + } else { + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); + } + } +} + +pivot_row += (-(int32_t)(~pivot_is_zero)); + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + 
y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 
1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + 
} +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk 
= csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. 
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? + for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = 
PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + 
uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + 
VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 + P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + 
(param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define 
MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 +#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" 
+#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define 
EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define 
/**
 * Struct defining MAYO parameters.
 *
 * One instance per parameter set; in ENABLE_PARAMS_DYNAMIC builds the
 * PARAM_xxx(p) accessors read these fields, otherwise they expand to the
 * compile-time MAYO_<variant>_xxx constants and the struct is only passed
 * around.  The *_bytes fields are sizes in bytes.
 */
typedef struct {
    int m;                          // parameter m (64, 96 or 128 in the sets defined in this header)
    int n;                          // parameter n
    int o;                          // parameter o; the accessors derive v = n - o
    int k;                          // parameter k; the accessors derive A_cols = k * o + 1
    int q;                          // 16 for every parameter set defined in this header
    const unsigned char *f_tail;    // F_TAIL_LEN coefficients, see F_TAIL_64 / F_TAIL_96 / F_TAIL_128
    int m_bytes;
    int O_bytes;
    int v_bytes;
    int r_bytes;
    int R_bytes;                    // NOTE(review): no PARAM_ accessor reads this field and
                                    // MAYO_GEN_PARAMS does not appear to set it -- confirm it is unused
    int P1_bytes;
    int P2_bytes;
    int P3_bytes;
    int csk_bytes;                  // compact secret key
    int esk_bytes;                  // expanded secret key
    int cpk_bytes;                  // compact public key
    int epk_bytes;                  // expanded public key
    int sig_bytes;
    int salt_bytes;
    int sk_seed_bytes;
    int digest_bytes;
    int pk_seed_bytes;
    const char *name;               // e.g. "MAYO_1"
} mayo_params_t;

/**
 * Expanded secret key as produced by mayo_expand_sk().
 */
typedef struct sk_t {
    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];    // P1 followed by the P2-sized part
    uint8_t o[O_BYTES_MAX];                         // encoded O
} sk_t;
/**
 * Status codes
 */
#define MAYO_OK 0
#define MAYO_ERR 1

/**
 * Mayo keypair generation.
 *
 * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
 * The caller is responsible for allocating sufficient memory to hold pk and sk.
 *
 * @param[in] p Mayo parameter set
 * @param[out] pk Mayo public key
 * @param[out] sk Mayo secret key
 * @return int status code
 */
#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);

/**
 * MAYO detached signature generation.
 *
 * Expands the compact secret key and signs m; only the signature (without
 * the message) is written to sig.
 * The caller is responsible for allocating sufficient memory to hold sig.
 *
 * @param[in] p Mayo parameter set
 * @param[out] sig Signature
 * @param[out] siglen Pointer to the length of sig
 * @param[in] m Message to be signed
 * @param[in] mlen Message length
 * @param[in] csk Compacted secret key
 * @return int status code
 */
#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
              size_t *siglen, const unsigned char *m,
              size_t mlen, const unsigned char *csk);

/**
 * MAYO signature generation.
 *
 * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
 * The key provided is a compacted secret key.
 * The caller is responsible for allocating sufficient memory to hold sm.
 *
 * @param[in] p Mayo parameter set
 * @param[out] sm Signature concatenated with message
 * @param[out] smlen Pointer to the length of sm
 * @param[in] m Message to be signed
 * @param[in] mlen Message length
 * @param[in] sk Compacted secret key
 * @return int status code
 */
#define mayo_sign MAYO_NAMESPACE(mayo_sign)
int mayo_sign(const mayo_params_t *p, unsigned char *sm,
              size_t *smlen, const unsigned char *m,
              size_t mlen, const unsigned char *sk);

/**
 * Mayo open signature.
 *
 * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
 * The key provided is a compact public key.
 * The caller is responsible for allocating sufficient memory to hold m.
 *
 * @param[in] p Mayo parameter set
 * @param[out] m Message stored if verification succeeds
 * @param[out] mlen Pointer to the length of m
 * @param[in] sm Signature concatenated with message
 * @param[in] smlen Length of sm
 * @param[in] pk Compacted public key
 * @return int status code
 */
#define mayo_open MAYO_NAMESPACE(mayo_open)
int mayo_open(const mayo_params_t *p, unsigned char *m,
              size_t *mlen, const unsigned char *sm,
              size_t smlen, const unsigned char *pk);

/**
 * Mayo compact keypair generation.
 *
 * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
 * The caller is responsible for allocating sufficient memory to hold cpk and csk.
 *
 * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
 * cpk are compact representations of a Mayo secret key and public key.
 *
 * @param[in] p Mayo parameter set
 * @param[out] cpk Mayo compacted public key
 * @param[out] csk Mayo compacted secret key
 * @return int status code
 */
#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
                         unsigned char *csk);

/**
 * Mayo expand public key.
 *
 * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
 * The caller is responsible for allocating sufficient memory to hold epk.
 *
 * @param[in] p Mayo parameter set
 * @param[in] cpk Compacted public key.
 * @param[out] epk Expanded public key.
 * @return int return code
 */
#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
                   unsigned char *epk);
// Branch-free "a > b" test.
// Returns 0xFFFFFFFF if a > b, 0x00000000 otherwise.
//
// The difference is formed in 64 bits so that it cannot overflow for any
// pair of int operands (assuming int is at most 32 bits wide, as the rest of
// this header does); the sign bit is then extracted with an unsigned shift
// and stretched to a full mask, which avoids relying on how the compiler
// shifts negative values.
static inline uint32_t ct_is_greater_than(int a, int b) {
    const int64_t diff = ((int64_t) b) - ((int64_t) a);
    const uint64_t is_negative = ((uint64_t) diff) >> (8*sizeof(uint64_t)-1);   // 1 iff a > b
    return (uint32_t) 0 - (uint32_t) is_negative;
}

// Branch-free "a > b" test, 64-bit mask variant.
// Returns 0xFFFFFFFFFFFFFFFF if a > b, 0x0000000000000000 otherwise.
static inline uint64_t ct_64_is_greater_than(int a, int b) {
    const int64_t diff = ((int64_t) b) - ((int64_t) a);
    const uint64_t is_negative = ((uint64_t) diff) >> (8*sizeof(uint64_t)-1);   // 1 iff a > b
    return (uint64_t) 0 - is_negative;
}
// Branch-free inequality test.
// if a == b -> 0x00000000, else 0xFFFFFFFF
//
// x is zero exactly when a == b.  For any non-zero x, at least one of x and
// its two's-complement negation has the top bit set, so (x | -x) >> 31 is 1
// iff a != b.  Everything is done on unsigned values: negating the signed
// XOR gives the wrong answer when the operands have opposite signs.
static inline uint32_t ct_compare_32(int a, int b) {
    const uint32_t x = ((uint32_t) a) ^ ((uint32_t) b);
    const uint32_t differs = (x | ((uint32_t) 0 - x)) >> (8*sizeof(uint32_t)-1);
    return (uint32_t) 0 - differs;
}

// Branch-free inequality test, 64-bit mask variant.
// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF
static inline uint64_t ct_compare_64(int a, int b) {
    const uint64_t x = (uint64_t) (((unsigned int) a) ^ ((unsigned int) b));
    const uint64_t differs = (x | ((uint64_t) 0 - x)) >> (8*sizeof(uint64_t)-1);
    return (uint64_t) 0 - differs;
}

// Branch-free inequality test, byte mask variant.
// if a == b -> 0x00, else 0xFF
static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
    const uint32_t x = (uint32_t) (a ^ b);
    const uint32_t differs = (x | ((uint32_t) 0 - x)) >> (8*sizeof(uint32_t)-1);
    return (unsigned char) ((uint32_t) 0 - differs);
}
// P1*O -> P1: v x v, O: v x o
//
// Accumulates (XORs) the product into _acc.
//
// _P1        read as a stream of 256-bit vectors, two consecutive vectors
//            per entry (r, c) with c >= r, rows in order (upper triangle).
// O_multabs  O_MAX/2 vectors per row of O, used as byte-shuffle lookup
//            tables -- presumably the GF(16) multiplication tables built by
//            mayo_O_multabs_avx2; confirm against that helper.
// _acc       written as 2*O_MAX 256-bit vectors per row r.
//
// NOTE(review): the k loops step by 2 and index temp[2*k + 3], so O_MAX is
// assumed to be even.
static
inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){

    const __m256i *P1  = (__m256i *) _P1;
    __m256i *acc = (__m256i *) _acc;
    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);

    // index of the next unread vector of P1
    size_t cols_used = 0;
    for (size_t r = 0; r < V_MAX; r++)
    {
        // do multiplications for one row and accumulate results in temporary format
        __m256i temp[2*O_MAX] = {0};
        for (size_t c = r; c < V_MAX; c++)
        {
            // split each input vector into its low nibbles (odd) and high nibbles (even),
            // so that every byte is a valid 4-bit shuffle index
            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
            in_odd0 &= low_nibble_mask;
            cols_used ++;
            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
            in_odd1 &= low_nibble_mask;
            cols_used ++;

            // one table lookup per nibble stream; each table serves columns k and k+1 of O
            for (size_t k = 0; k < O_MAX; k+=2)
            {
                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
            }
        }

        // convert to normal format and add to accumulator
        for (size_t k = 0; k < O_MAX; k+=2)
        {
            // t0/t1 are the nibbles that have to change places between the two temp vectors
            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
        }
    }
}
_mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1; + } + } +} + + +static +inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = 
_mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k )); + __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1)); + __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2)); + __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3)); + + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1); + } + } +} + + +static +inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i 
temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do 
multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + +static +inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + 
size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k +static +inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i 
*P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], 
in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); 
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h new file mode 100644 index 0000000000..defff86f8f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_64_H +#define SHUFFLE_ARITHMETIC_64_H + +#include +#include +#include +#include + +// P1*O -> P1: v x v, O: v x o +static +inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t; + } + } +} + + +static +inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } + } +} + +static +inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const 
unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + pos); + pos += (V_MAX -c - 1); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k )); + __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1)); + + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4)); + _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); + } + } +} + + +static +inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = 
_mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static +inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to 
normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +static +inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular +// same as mayo_12_P1_times_Vt_avx2 +static +inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){ + 
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc); +} + +static +inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for 
one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k +static +inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even); + } + } + + 
// P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P3 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= 
temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static inline +void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { + (void) size; + int m_vecs_stored = 0; + + for (int r = 0; r < O_MAX; ++r) { + const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); + __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in[0]; + m_vecs_stored++; + for (int c = r + 1; c < O_MAX; ++c) { + const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); + const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); + _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in2[0] ^ _in3[0]; + m_vecs_stored++; + } + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h new file mode 100644 index 0000000000..9b3a69d567 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_96_H +#define SHUFFLE_ARITHMETIC_96_H + +#include +#include +#include +#include + + +// P1*O -> P1: v x v, O: v x o +static +inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = 
_mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t 
*acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + cols_used ++; + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); 
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + 
temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){ + const __m256i 
low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ 
temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < 
K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + // P1 times S1 
+ __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 times S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k 
+ 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = 
_mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ 
_mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){ + mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); +} + +static +inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + 
p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} 
+ +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.c b/src/sig/mayo/pqmayo_mayo-2_opt/api.c new file mode 100644 index 0000000000..a7cf85eedf --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.c @@ -0,0 +1,46 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_2 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.h b/src/sig/mayo/pqmayo_mayo-2_opt/api.h new file mode 100644 index 0000000000..265a5639db --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 24 +#define CRYPTO_PUBLICKEYBYTES 5488 +#define CRYPTO_BYTES 180 + +#define CRYPTO_ALGNAME "MAYO-2" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const 
unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else // generic path: for each r <= c, copy in[r][c] to the next slot of out and, when r != c, m_vec_add in[c][r] onto it + int m_vecs_stored = 0; // number of m-vectors (m_legs * 2 words each) written to out so far + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p); + + int + bs_mat_entries_used = 0; // running index into P1, advanced once per (r, c) pair with c >= r + for (int r = 0; r < param_v; r++) { + for
(int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, 
V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + 
mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + 
mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row 
of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; // OR of the last row's coefficient entries; stays 0 only if all of them are zero + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the two bytes are equal, 0xff otherwise, so correct_column is 0xff only at the first nonzero entry scanned in this row. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; // adds this row's last-column value to x[col] at the pivot column only (u is 0 elsewhere) + + for (int i = 0; i < row; i += 8) { // rows are handled 8 at a time, so rows i..i+7 are read and updated even past row - 1; NOTE(review): presumably A holds a multiple of 8 rows -- confirm + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); // NOTE(review): presumably multiplies each of the 8 packed entries by u -- confirm against mul_fx8 + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git
a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned 
char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h new file mode 100644 index 0000000000..418c308e2f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 8;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * 
((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h new file mode 100644 index 0000000000..a70b7a3118 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 
^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +static inline uint32_t mul_table(uint8_t b){ + uint32_t x = ((uint32_t) b) * 0x08040201; // for b < 16, byte j of x holds b << j (j = 0..3) + + uint32_t high_nibble_mask = 0xf0f0f0f0; + + uint32_t high_half = x & high_nibble_mask; + return (x ^ (high_half >> 4) ^ (high_half >> 3)); // folds bits 4..6 of each byte back with x^4 = x + 1; the high nibbles stay set, so users mask the bytes +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { // acc ^= a * in over 4 words of packed nibbles; bit j of each nibble selects byte j of tab + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 4;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) { // same as vec_mul_add_64 but over a caller-chosen number of words (legs) + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < legs; i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); // acc ^= in * x: shift each nibble left, replacing an overflowing x^4 by x + 1 (= 3) + } +} + +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); // acc ^= in * x^-1: shift each nibble right, replacing a dropped constant term by x^3 + 1 (= 9) + } +} + +static +inline void
multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h new file mode 100644 index 0000000000..a38f89e454 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + uint64_t 
mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 6;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h new file mode 100644 index 0000000000..d337bc238c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h @@ -0,0 +1,469 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include + +#ifndef MAYO_VARIANT +static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) { + + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2); + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 
3 * m_legs * 2); + + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2); + + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2); + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2); + + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2); + + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2); + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2); + + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_copy(m_legs, bins + 1 * m_legs * 2, out); +} +#endif + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *PS) { + + const int n = o + v; +#if 
defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128) + (void)m; +#else + const int m_legs = m / 32; +#endif + + /* Old approach which is constant time but doesn't have to be + unsigned char S1[V_MAX*K_MAX]; + unsigned char S2[O_MAX*K_MAX]; + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2. + */ + + // use more stack efficient version for MAYO_3 and MAYO_5 + #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78 + uint64_t accumulator[M_MAX * N_MAX] = {0}; + int P1_used; + int P3_used; + for (int col = 0; col < k; col++) { + for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) { + accumulator[i] = 0; + } + P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8); +#else + bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P1_used ++; + } + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + 
S[(col * n) + j + v] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 ); +#else + bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 ); +#endif + } + } + + P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8); +#else + bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P3_used ++; + } + } + + for (int row = 0; row < n; row++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8); +#else + bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2); +#endif + } + } + + #else + + alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0}; + int P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, 
accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif 
defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P1_used ++; + } + + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator 
+ ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + 
j + v] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 ); + } +#endif + } + } + + int P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 
16 + S[3 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 
128) && (K_MAX == 12) + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P3_used ++; + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < n * k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8); + i++; +#else + m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2); + i++; +#endif + } + + #endif +} + + +static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){ + alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; + #if !defined(MAYO_VARIANT) + const int m_legs = m/32; + #else + (void) m; + #endif + for (int row = 0; row < k; row++) { + for (int j = 0; j < n; j++) { + for (int col = 0; col < k; col += 1) { + #if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 ); + #else + m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 ); + #endif + } + } + } + + // multiply stuff according to the bins of the accumulator and add to SPS. 
+ int i = 0; + while (i < k*k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8); + i++; +#else + m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2); + i++; +#endif + } +} + + +// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { + + int bs_mat_entries_used = 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { + + int bs_mat_entries_used 
= 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_rows; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies the transpose of a single matrix with m matrices and adds result to acc +static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_cols; r++) { + for (int c = 0; c < mat_rows; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} + +// multiplies a single matrix with m matrices and adds result to acc +static 
inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_rows; r++) { + for (int c = 0; c < mat_cols; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h new file mode 100644 index 0000000000..82505847c9 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h @@ -0,0 +1,152 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ECHELON_FORM_H +#define ECHELON_FORM_H + +#include +#include +#include +#include + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +static inline unsigned char +m_extract_element(const uint64_t *in, int index) { + const int leg = index / 16; + const int offset = index % 16; + + return (in[leg] >> (offset*4)) & 0xF; +} + +static inline void +ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) { + int i; + unsigned char *out8 = (unsigned char *)out; + for(i = 0; i+1 < ncols; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4); +#else + out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4); +#endif + } + if (ncols % 2 == 1){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0); +#else + out8[i/2] = (in[i+0] << 0); +#endif + } +} + +static inline void +ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) { + const unsigned char *in8 = (const unsigned char *)in; + for(int i = 0; i < legs * 16; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out[i] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF; + out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + out[i] = (in8[i/2]) & 0xF; + out[i+1] = (in8[i/2] >> 4); +#endif + } +} + + +// put matrix in row echelon form with ones on first nonzero entries *in +// constant time* +static inline void EF(unsigned char *A, int nrows, int ncols) { + + alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 }; + + int row_len = (ncols + 15) / 16; + + // nibbleslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols); + } + + // pivot row is secret, pivot col is not + + unsigned char inverse; + int pivot_row = 0; + for (int pivot_col = 0; pivot_col < ncols; pivot_col++) { + + int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols); + int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col); + // the pivot row is 
guaranteed to be between these lower and upper bounds if + // A has full rank + + // zero out pivot row + for (int i = 0; i < row_len; i++) { + _pivot_row[i] = 0; + _pivot_row2[i] = 0; + } + + // try to get a pivot row in constant time + unsigned char pivot = 0; + uint64_t pivot_is_zero = -1; + for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + + uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); + uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); + + for (int j = 0; j < row_len; j++) { + _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) & + packed_A[row * row_len + j]; + } + pivot = m_extract_element(_pivot_row, pivot_col); + pivot_is_zero = ~ct_compare_64((int) pivot, 0); + } + + // multiply pivot row by inverse of pivot + inverse = inverse_f(pivot); + vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); + + // conditionally write pivot row to the correct row, if there is a nonzero + // pivot + for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) { + uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero; + uint64_t do_not_copy = ~do_copy; + for (int col = 0; col < row_len; col++) { + packed_A[row * row_len + col] = + (do_not_copy & packed_A[row * row_len + col]) + + (do_copy & _pivot_row2[col]); + } + } + + // eliminate entries below pivot + for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (row > pivot_row); + unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col); + + vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim, + packed_A + row * row_len); + } + + pivot_row += (-(int64_t)(~pivot_is_zero)); + } + + unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; + + // unbitslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_unpack_m_vec(row_len, packed_A + i * row_len, temp); + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = temp[j]; + } + } + 
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15); + mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + 
temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef 
ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + 
uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + } +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int 
param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk = csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, 
param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. + alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? 
+ for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +/* Signed-message API: writes signature || message to sm and the total length to *smlen. Returns MAYO_OK on success; on failure *smlen is left untouched. */ +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + + /* Move the message to its final position before signing: mayo_sign_signature writes the signature at the start of sm, which would corrupt m when the caller passes overlapping buffers (e.g. m == sm). memmove tolerates the overlap, and the signer then reads the relocated copy. */ + memmove(sm + param_sig_bytes, m, mlen); + ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) { + /* A length mismatch must not be reported as success, because *smlen is not set on this path. */ + if (ret == MAYO_OK) { + ret = MAYO_ERR; + } + goto err; + } + + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char 
S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && 
!defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 
+ P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + (param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - 
param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 
+#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" +#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define 
MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX 
PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define 
PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. 
+ * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * Keys provided is a compacted secret keys. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * Keys provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * Keys provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mem.h b/src/sig/mayo/pqmayo_mayo-2_opt/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/params.c b/src/sig/mayo/pqmayo_mayo-2_opt/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char 
inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} + +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 
2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/api.c b/src/sig/mayo/pqmayo_mayo-3_avx2/api.c new file mode 100644 index 0000000000..5c42eabc48 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/api.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_3 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + 
return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/api.h b/src/sig/mayo/pqmayo_mayo-3_avx2/api.h new file mode 100644 index 0000000000..b08c24704e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 32 +#define CRYPTO_PUBLICKEYBYTES 2656 +#define CRYPTO_BYTES 577 + +#define CRYPTO_ALGNAME "MAYO-3" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- 
/dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for (int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], 
acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * 
K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const 
int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, 
S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); 
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && 
(M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h new file mode 100644 index 0000000000..27b367e940 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h @@ -0,0 +1,78 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static + inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + 
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h new file mode 100644 index 0000000000..9f7535c878 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + 
m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h new file mode 100644 index 0000000000..86359679fb --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + 
m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h new file mode 100644 index 0000000000..eeb13dc0bd --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include +#include + +#define K_OVER_2 ((K_MAX+1)/2) + +static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, + 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, + 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01 +}; + +// +// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper! 
+// +static inline __m256i tbl32_gf16_multab2( uint8_t b ) { + + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { + return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f); +} + +static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) { + __m256i multab_l = tbl32_gf16_multab2( b ); + __m256i multab_h = _mm256_slli_epi16( multab_l, 4 ); + + return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) ); +} + +static +inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){ + // build multiplication tables + for (size_t r = 0; r < V_MAX; r++) + { + for (size_t c = 0; c < O_MAX; c+=2) + { + O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4); + } + } +} + + +static +inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){ + // build multiplication tables + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ 
_mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4); + } +#if K_MAX % 2 == 1 + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); +#endif + } +} + +static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 
0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a}; + + +static +inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) { + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])); +#endif + } +} + +static +inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) { + // build multiplication tables + size_t r; + for (size_t c = 0; c < O_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i 
*)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ; +#endif + } +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h new file mode 100644 index 0000000000..fa69de0ab2 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) + + +// +// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/. 
+// +static inline __m256i tbl32_gf16_multab( uint8_t b ) { + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +/* put matrix in row echelon form with ones on first nonzero entries in constant time*/ +static inline void EF(unsigned char *A, int _nrows, int _ncols) { + + (void) _nrows; + (void) _ncols; + + #define nrows M_MAX + #define ncols (K_MAX * O_MAX + 1) + + #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32) + #define MAX_COLS (AVX_REGS_PER_ROW * 32) + + __m256i _pivot_row[AVX_REGS_PER_ROW]; + __m256i A_avx[AVX_REGS_PER_ROW* M_MAX]; + + unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row; + unsigned char* A_bytes = (unsigned char*) A_avx; + + // load A in the tail of AVX2 registers + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) + { + A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ]; + } + } + + // pivot row is secret, pivot col is not + unsigned char inverse; + int pivot_row = 0; + int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); + for (; pivot_col < MAX_COLS-128; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-96; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-64; pivot_col++) { + #include 
"echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-32; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS; pivot_col++) { + #include "echelon_form_loop.h" + } + + // write the matrix A back + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j]; + } + } + mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32); + mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows); +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h new file mode 100644 index 0000000000..b8b29741c4 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 + +int pivot_col_rounded = pivot_col/32; + +int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS); +int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols); +/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/ + +/* zero out pivot row */ +for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) { + _pivot_row[i] = _mm256_set1_epi8(0); +} + +/* try to get a pivot row in constant time */ +unsigned char pivot = 0; +uint32_t pivot_is_zero = -1; +for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); + uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); + __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j]; + } + pivot = pivot_row_bytes[pivot_col]; + pivot_is_zero = ~ct_compare_32((int) pivot, 0); +} + +/* multiply pivot row by inverse of pivot */ +inverse = inverse_f(pivot); +__m256i inverse_multab = tbl32_gf16_multab(inverse); + 
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); +} + +/* conditionally write pivot row to the correct row, if there is a nonzero pivot */ +/* eliminate entries below pivot */ +for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row)); + unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; + + __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); + if (row <= pivot_row_upper_bound) { + __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); + for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { + A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^ + _mm256_shuffle_epi8(multab, _pivot_row[col]); + } + } else { + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); + } + } +} + +pivot_row += (-(int32_t)(~pivot_is_zero)); + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + 
y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 
1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + 
} +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk 
= csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. 
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? + for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +}
+
+// Writes signature || message into sm and stores the combined length in *smlen.
+//
+// The message is moved behind the signature slot *before* signing and the
+// signature is computed over that copy, so an m that overlaps sm is not
+// clobbered by the signature bytes. On any failure sm is zeroed, *smlen is
+// left untouched and a non-OK status is returned.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+
+    // memmove (not memcpy): source and destination may overlap.
+    memmove(sm + param_sig_bytes, m, mlen);
+    ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) {
+        // Never hand back a half-built signed message, and never report
+        // success for a signature of unexpected length.
+        memset(sm, 0, (size_t) param_sig_bytes + mlen);
+        if (ret == MAYO_OK) {
+            ret = MAYO_ERR;
+        }
+        goto err;
+    }
+
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = 
PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + 
uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + 
VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 + P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + 
(param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define 
MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 +#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" 
+#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define 
EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define 
PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern 
const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * Keys provided is a compacted secret keys. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * Keys provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * The key provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[in] m Message whose signature is checked + * @param[in] mlen Length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/params.c b/src/sig/mayo/pqmayo_mayo-3_avx2/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h new file mode 100644 index 0000000000..27b416adce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h @@ -0,0 +1,524 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_128_H +#define SHUFFLE_ARITHMETIC_128_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + cols_used ++; + __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); /* the same table entry serves the low-nibble and high-nibble halves of both loaded words */ + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0; + acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1; + } + } +} + +/* For each column c < O_MAX, runs over all V_MAX rows of _P1O_P2 (two 256-bit words per entry at 2*r*O_MAX + 2*c), multiplies through the tables O_multabs[O_MAX/2*r + ...] and XORs the recombined sums into _acc at rows k and k+1, column c - per its name, O transposed times the input. */ static +inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^=
_mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1; + } + } +} + + +/* Builds the O shuffle tables with mayo_O_multabs_avx2, then for every row r combines the off-diagonal entries of the upper-triangular P1 from both sides of the diagonal: column r of the rows above it (walked with pos) and the entries to the right of the diagonal in row r (walked with cols_used). The diagonal entry itself is skipped. Results are XORed into _acc with unaligned loads and stores. */ static +inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + cols_used += 1; /* step over the diagonal entry of row r */ + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); /* advance to the entry in column r of the next stored row, which is one entry shorter */ + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 =
_mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + /* recombine the nibble-split partial sums and XOR them into row r of the accumulator */ for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k )); + __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1)); + __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2)); + __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3)); + + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1); + } + } +} + + +/* For each column c < O_MAX, runs over the V_MAX rows of _L (two 256-bit words per entry), multiplies through the K_OVER_2 tables stored per row in V_multabs and XORs the recombined result into _acc at rows k < K_MAX, column c - per its name, V transposed times L. */ static +inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i
temp[K_OVER_2*2*2] = {0}; /* four partial sums per table index: low and high nibble results for each of the two words of an entry */ + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: the loop above stops one short, so the final row k is written alone */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +/* For each row r < V_MAX, consumes the upper-triangular P1 in storage order (entries c >= r, two 256-bit words each), multiplies through the tables V_multabs[K_OVER_2*c + ...] and XORs K_MAX results into row r of _acc. */ static +inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do
multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; /* cols_used is never reset, so P1 is read front to back across all rows */ + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: write the last result entry of row r on its own */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + +/* For each column c < K_MAX, runs over the V_MAX rows of _Pv (two 256-bit words per entry at 2*r*K_MAX + 2*c), multiplies through V_multabs[K_OVER_2*r + ...] and XORs the recombined result into _acc at rows k < K_MAX, column c - per its name, V transposed times Pv. */ static +inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); +
size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: the last output row k is handled outside the paired loop */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P1*S1 plus P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k +/* Fused form: for each row r both products are folded into the same temp array before a single recombination into row r of _acc. */ static +inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i
*P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; /* running index into P1, never reset: P1 is read in storage order across all rows */ + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; /* four partial sums per table index: low and high nibble results for each of the two words of an entry */ + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 * S2 /* P2 is addressed directly by row r and column c, not through a running index */ + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^
_mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: write the last result entry of row r on its own */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +/* For each row r < O_MAX, consumes P3 in storage order (entries c >= r, two 256-bit words each), multiplies through S2_multabs[K_OVER_2*c + ...] and XORs K_MAX results into row r of _acc. */ static +inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k],
in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: write the last result entry of row r on its own */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +/* Alias: forwards unchanged to mayo_5_Vt_times_Pv_avx2 with S1_multabs as the table argument. */ static +inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +/* Same loop shape as the Vt-times-Pv routine, but the inner row loop runs over O_MAX rows of _PS2 instead of V_MAX; results are XORed into _acc at rows k < K_MAX, column c. */ static +inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +/* odd K_MAX: the last output row k is handled outside the paired loop */ #if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +/* NOTE(review): K_OVER_2 is used throughout this header but no definition of it is visible here - presumably it comes from an included header; confirm that removing it at this point is intended. */ #undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h new file mode 100644 index 0000000000..defff86f8f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_64_H +#define SHUFFLE_ARITHMETIC_64_H + +#include /* NOTE(review): the four #include directives here carry no header names - confirm against the original file */ +#include +#include +#include + +// P1*O -> P1: v x v, O: v x o +/* Single-word variant: each matrix entry is one 256-bit word. P1 is consumed in storage order as the upper triangle (c >= r); the low and high nibbles of each word index the byte-shuffle tables in O_multabs, and the recombined sums are XORed into row r of _acc. */ static +inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and
accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t; + } + } +} + + +/* For each column c < O_MAX, runs over all V_MAX rows of _P1O_P2 (one 256-bit word per entry at r*O_MAX + c), multiplies through O_multabs[O_MAX/2*r + ...] and XORs the recombined sums into _acc at rows k and k+1, column c - per its name, O transposed times the input. */ static +inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } + } +} + +/* Builds the O shuffle tables with mayo_O_multabs_avx2, then for every row r combines the off-diagonal entries of the upper-triangular P1 from both sides of the diagonal; the diagonal entry is skipped. */ static +inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const
unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + cols_used += 1; /* step over the diagonal entry of row r */ + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + pos); + pos += (V_MAX -c - 1); /* advance to the entry in column r of the next stored row, which is one entry shorter */ + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + /* recombine the nibble-split partial sums and XOR them into row r of the accumulator using unaligned loads and stores */ for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k )); + __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1)); + + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4)); + _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); + } + } +} + + +/* For each column c < O_MAX, runs over the V_MAX rows of _L (one 256-bit word per entry), multiplies through the K_OVER_2 tables stored per row in V_multabs and XORs the recombined result into _acc at rows k < K_MAX, column c - per its name, V transposed times L. */ static +inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask =
_mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; /* two partial sums per table index: low and high nibble results */ + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } +/* odd K_MAX: the last output row k is handled outside the paired loop */ #if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +/* For each column c < K_MAX, runs over the V_MAX rows of _Pv (one 256-bit word per entry at r*K_MAX + c), multiplies through V_multabs[K_OVER_2*r + ...] and XORs the recombined result into _acc at rows k < K_MAX, column c - per its name, V transposed times Pv. */ static +inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to
normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +/* odd K_MAX: the last output row k is handled outside the paired loop */ #if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +/* For each row r < V_MAX, consumes the upper-triangular P1 in storage order (entries c >= r, one 256-bit word each), multiplies through V_multabs[K_OVER_2*c + ...] and XORs K_MAX results into row r of _acc. */ static +inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; /* cols_used is never reset, so P1 is read front to back across all rows */ + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +/* odd K_MAX: write the last result entry of row r on its own */ #if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular +// same as mayo_12_P1_times_Vt_avx2 +/* Alias: the body only forwards its three arguments to mayo_12_P1_times_Vt_avx2. */ static +inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){ +
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc); +} + +/* Alias: forwards unchanged to mayo_12_Vt_times_Pv_avx2 with S1_multabs as the table argument. */ static +inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +/* Same loop shape as the Vt-times-Pv routine, but the inner row loop runs over O_MAX rows of _PS2 instead of V_MAX; results are XORed into _acc at rows k < K_MAX, column c. */ static +inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +/* odd K_MAX: the last output row k is handled outside the paired loop */ #if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +/* For each row r < V_MAX, reads the O_MAX entries of that row of P2 through a running index, multiplies through S2_multabs[K_OVER_2*c + ...] and XORs K_MAX results into row r of _acc. */ static +inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for
one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; /* full rows of O_MAX entries, so cols_used equals r*O_MAX + c here */ + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +/* odd K_MAX: write the last result entry of row r on its own */ #if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P1*S1 plus P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k +/* Fused form: for each row r both products are folded into the same temp array before a single recombination into row r of _acc. */ static +inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even); + } + } + +
// P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P3 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= 
temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static inline +void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { + (void) size; + int m_vecs_stored = 0; + + for (int r = 0; r < O_MAX; ++r) { + const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); + __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in[0]; + m_vecs_stored++; + for (int c = r + 1; c < O_MAX; ++c) { + const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); + const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); + _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in2[0] ^ _in3[0]; + m_vecs_stored++; + } + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h new file mode 100644 index 0000000000..9b3a69d567 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_96_H +#define SHUFFLE_ARITHMETIC_96_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = 
_mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t 
*acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + cols_used ++; + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); 
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + 
temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){ + const __m256i 
low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ 
temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < 
K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + // P1 times S1 
+ __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 times S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k 
+ 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = 
_mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ 
_mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){ + mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); +} + +static +inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + 
p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} 
+ +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/api.c b/src/sig/mayo/pqmayo_mayo-3_opt/api.c new file mode 100644 index 0000000000..5c42eabc48 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/api.c @@ -0,0 +1,46 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_3 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/api.h b/src/sig/mayo/pqmayo_mayo-3_opt/api.h new file mode 100644 index 0000000000..b08c24704e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 32 +#define CRYPTO_PUBLICKEYBYTES 2656 +#define CRYPTO_BYTES 577 + +#define CRYPTO_ALGNAME "MAYO-3" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const 
unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for 
(int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, 
V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + 
mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + 
mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 collumns (values in the last collum are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row 
of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git 
a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned 
char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h new file mode 100644 index 0000000000..418c308e2f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 8;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * 
((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h new file mode 100644 index 0000000000..a70b7a3118 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 
^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +static inline uint32_t mul_table(uint8_t b){ + uint32_t x = ((uint32_t) b) * 0x08040201; + + uint32_t high_nibble_mask = 0xf0f0f0f0; + + uint32_t high_half = x & high_nibble_mask; + return (x ^ (high_half >> 4) ^ (high_half >> 3)); +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 4;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < legs; i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void 
multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h new file mode 100644 index 0000000000..a38f89e454 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + uint64_t 
mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 6;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h new file mode 100644 index 0000000000..d337bc238c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h @@ -0,0 +1,469 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include + +#ifndef MAYO_VARIANT +static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) { + + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2); + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 
3 * m_legs * 2); + + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2); + + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2); + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2); + + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2); + + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2); + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2); + + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_copy(m_legs, bins + 1 * m_legs * 2, out); +} +#endif + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *PS) { + + const int n = o + v; +#if 
defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128) + (void)m; +#else + const int m_legs = m / 32; +#endif + + /* Old approach which is constant time but doesn't have to be + unsigned char S1[V_MAX*K_MAX]; + unsigned char S2[O_MAX*K_MAX]; + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2. + */ + + // use more stack efficient version for MAYO_3 and MAYO_5 + #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78 + uint64_t accumulator[M_MAX * N_MAX] = {0}; + int P1_used; + int P3_used; + for (int col = 0; col < k; col++) { + for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) { + accumulator[i] = 0; + } + P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8); +#else + bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P1_used ++; + } + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + 
S[(col * n) + j + v] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 ); +#else + bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 ); +#endif + } + } + + P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8); +#else + bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P3_used ++; + } + } + + for (int row = 0; row < n; row++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8); +#else + bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2); +#endif + } + } + + #else + + alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0}; + int P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, 
accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif 
defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P1_used ++; + } + + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator 
+ ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + 
j + v] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 ); + } +#endif + } + } + + int P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 
16 + S[3 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 
128) && (K_MAX == 12) + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P3_used ++; + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < n * k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8); + i++; +#else + m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2); + i++; +#endif + } + + #endif +} + + +static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){ + alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; + #if !defined(MAYO_VARIANT) + const int m_legs = m/32; + #else + (void) m; + #endif + for (int row = 0; row < k; row++) { + for (int j = 0; j < n; j++) { + for (int col = 0; col < k; col += 1) { + #if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 ); + #else + m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 ); + #endif + } + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < k*k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8); + i++; +#else + m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2); + i++; +#endif + } +} + + +// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { + + int bs_mat_entries_used = 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { + + int bs_mat_entries_used 
= 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_rows; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies the transpose of a single matrix with m matrices and adds result to acc +static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_cols; r++) { + for (int c = 0; c < mat_rows; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} + +// multiplies a single matrix with m matrices and adds result to acc +static 
inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_rows; r++) { + for (int c = 0; c < mat_cols; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h new file mode 100644 index 0000000000..82505847c9 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h @@ -0,0 +1,152 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ECHELON_FORM_H +#define ECHELON_FORM_H + +#include +#include +#include +#include + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +static inline unsigned char +m_extract_element(const uint64_t *in, int index) { + const int leg = index / 16; + const int offset = index % 16; + + return (in[leg] >> (offset*4)) & 0xF; +} + +static inline void +ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) { + int i; + unsigned char *out8 = (unsigned char *)out; + for(i = 0; i+1 < ncols; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4); +#else + out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4); +#endif + } + if (ncols % 2 == 1){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0); +#else + out8[i/2] = (in[i+0] << 0); +#endif + } +} + +static inline void +ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) { + const unsigned char *in8 = (const unsigned char *)in; + for(int i = 0; i < legs * 16; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out[i] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF; + out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + out[i] = (in8[i/2]) & 0xF; + out[i+1] = (in8[i/2] >> 4); +#endif + } +} + + +// put matrix in row echelon form with ones on first nonzero entries *in +// constant time* +static inline void EF(unsigned char *A, int nrows, int ncols) { + + alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 }; + + int row_len = (ncols + 15) / 16; + + // nibbleslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols); + } + + // pivot row is secret, pivot col is not + + unsigned char inverse; + int pivot_row = 0; + for (int pivot_col = 0; pivot_col < ncols; pivot_col++) { + + int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols); + int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col); + // the pivot row is 
guaranteed to be between these lower and upper bounds if + // A has full rank + + // zero out pivot row + for (int i = 0; i < row_len; i++) { + _pivot_row[i] = 0; + _pivot_row2[i] = 0; + } + + // try to get a pivot row in constant time + unsigned char pivot = 0; + uint64_t pivot_is_zero = -1; + for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + + uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); + uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); + + for (int j = 0; j < row_len; j++) { + _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) & + packed_A[row * row_len + j]; + } + pivot = m_extract_element(_pivot_row, pivot_col); + pivot_is_zero = ~ct_compare_64((int) pivot, 0); + } + + // multiply pivot row by inverse of pivot + inverse = inverse_f(pivot); + vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); + + // conditionally write pivot row to the correct row, if there is a nonzero + // pivot + for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) { + uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero; + uint64_t do_not_copy = ~do_copy; + for (int col = 0; col < row_len; col++) { + packed_A[row * row_len + col] = + (do_not_copy & packed_A[row * row_len + col]) + + (do_copy & _pivot_row2[col]); + } + } + + // eliminate entries below pivot + for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (row > pivot_row); + unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col); + + vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim, + packed_A + row * row_len); + } + + pivot_row += (-(int64_t)(~pivot_is_zero)); + } + + unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; + + // unbitslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_unpack_m_vec(row_len, packed_A + i * row_len, temp); + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = temp[j]; + } + } + 
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15); + mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + 
temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef 
ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + 
uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + } +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int 
param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk = csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, 
param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. + alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? 
+ for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char 
S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && 
!defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 
+ P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + (param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - 
param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 
+#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" +#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define 
MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX 
PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define 
PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. 
+ * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * Keys provided is a compacted secret keys. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * Keys provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * Keys provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mem.h b/src/sig/mayo/pqmayo_mayo-3_opt/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/params.c b/src/sig/mayo/pqmayo_mayo-3_opt/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char 
inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} + +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 
2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/api.c b/src/sig/mayo/pqmayo_mayo-5_avx2/api.c new file mode 100644 index 0000000000..f2e861e9c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/api.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_5 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + 
return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/api.h b/src/sig/mayo/pqmayo_mayo-5_avx2/api.h new file mode 100644 index 0000000000..404d185c08 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 40 +#define CRYPTO_PUBLICKEYBYTES 5008 +#define CRYPTO_BYTES 838 + +#define CRYPTO_ALGNAME "MAYO-5" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- 
/dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for (int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], 
acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * 
K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const 
int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, 
S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 columns (values in the last column are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); 
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && 
(M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h new file mode 100644 index 0000000000..27b367e940 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h @@ -0,0 +1,78 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 8;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static + inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + 
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h new file mode 100644 index 0000000000..9f7535c878 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<4;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + 
m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h new file mode 100644 index 0000000000..86359679fb --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h @@ -0,0 +1,74 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x2); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + for(int i=0;i<6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], 0x9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < 6;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + 
m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h new file mode 100644 index 0000000000..eeb13dc0bd --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h @@ -0,0 +1,175 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include +#include + +#define K_OVER_2 ((K_MAX+1)/2) + +static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { + 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, + 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, + 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, + 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01 +}; + +// +// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper! 
+// +static inline __m256i tbl32_gf16_multab2( uint8_t b ) { + + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { + return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f); +} + +static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) { + __m256i multab_l = tbl32_gf16_multab2( b ); + __m256i multab_h = _mm256_slli_epi16( multab_l, 4 ); + + return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) ); +} + +static +inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){ + // build multiplication tables + for (size_t r = 0; r < V_MAX; r++) + { + for (size_t c = 0; c < O_MAX; c+=2) + { + O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4); + } + } +} + + +static +inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){ + // build multiplication tables + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ 
_mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4); + } +#if K_MAX % 2 == 1 + V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); +#endif + } +} + +static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 
0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, + 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a}; + + +static +inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) { + size_t r; + for (size_t c = 0; c < V_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])); +#endif + } +} + +static +inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) { + // build multiplication tables + size_t r; + for (size_t c = 0; c < O_MAX; c++) + { + for (r = 0; r+1 < K_MAX; r+= 2) + { + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) + ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4); + } +#if K_MAX % 2 == 1 + S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i 
*)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ; +#endif + } +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h new file mode 100644 index 0000000000..fa69de0ab2 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h @@ -0,0 +1,88 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include + + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) + + +// +// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/. 
+// +static inline __m256i tbl32_gf16_multab( uint8_t b ) { + __m256i bx = _mm256_set1_epi16( b & 0xf ); + __m256i b1 = _mm256_srli_epi16( bx, 1 ); + + const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)); + const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)); + const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)); + const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)); + + __m256i mask_1 = _mm256_set1_epi16(1); + __m256i mask_4 = _mm256_set1_epi16(4); + __m256i mask_0 = _mm256_setzero_si256(); + + return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) ) + ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) ) + ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) ) + ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) ); +} + +/* put matrix in row echelon form with ones on first nonzero entries in constant time*/ +static inline void EF(unsigned char *A, int _nrows, int _ncols) { + + (void) _nrows; + (void) _ncols; + + #define nrows M_MAX + #define ncols (K_MAX * O_MAX + 1) + + #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32) + #define MAX_COLS (AVX_REGS_PER_ROW * 32) + + __m256i _pivot_row[AVX_REGS_PER_ROW]; + __m256i A_avx[AVX_REGS_PER_ROW* M_MAX]; + + unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row; + unsigned char* A_bytes = (unsigned char*) A_avx; + + // load A in the tail of AVX2 registers + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) + { + A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ]; + } + } + + // pivot row is secret, pivot col is not + unsigned char inverse; + int pivot_row = 0; + int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); + for (; pivot_col < MAX_COLS-128; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-96; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-64; pivot_col++) { + #include 
"echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS-32; pivot_col++) { + #include "echelon_form_loop.h" + } + for (; pivot_col < MAX_COLS; pivot_col++) { + #include "echelon_form_loop.h" + } + + // write the matrix A back + for (int i = 0; i < nrows; i++) { + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j]; + } + } + mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32); + mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows); +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h new file mode 100644 index 0000000000..b8b29741c4 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h @@ -0,0 +1,58 @@ +// SPDX-License-Identifier: Apache-2.0 + +int pivot_col_rounded = pivot_col/32; + +int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS); +int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols); +/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/ + +/* zero out pivot row */ +for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) { + _pivot_row[i] = _mm256_set1_epi8(0); +} + +/* try to get a pivot row in constant time */ +unsigned char pivot = 0; +uint32_t pivot_is_zero = -1; +for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); + uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); + __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j]; + } + pivot = pivot_row_bytes[pivot_col]; + pivot_is_zero = ~ct_compare_32((int) pivot, 0); +} + +/* multiply pivot row by inverse of pivot */ +inverse = inverse_f(pivot); +__m256i inverse_multab = tbl32_gf16_multab(inverse); + 
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); +} + +/* conditionally write pivot row to the correct row, if there is a nonzero pivot */ +/* eliminate entries below pivot */ +for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row)); + unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col]; + + __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); + if (row <= pivot_row_upper_bound) { + __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); + for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { + A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^ + _mm256_shuffle_epi8(multab, _pivot_row[col]); + } + } else { + for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) { + A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]); + } + } +} + +pivot_row += (-(int32_t)(~pivot_is_zero)); + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + 
y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 
1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + 
} +} + +// Public API + +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + int ret = 0; + + ret = mayo_keypair_compact(p, pk, sk); + if (ret != MAYO_OK) { + goto err; + } + +err: + return ret; +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk 
= csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. 
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? + for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk); + if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) + goto err; + + memmove(sm + param_sig_bytes, m, mlen); + *smlen = siglen + mlen; +err: + return ret; +} + +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = 
PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + 
uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + 
VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 + P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + 
(param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define 
MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 +#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" 
+#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define 
EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define 
PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern 
const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * Keys provided is a compacted secret keys. + * The caller is responsible to allocate sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * Keys provided is a compact public key. + * The caller is responsible to allocate sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * Keys provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/params.c b/src/sig/mayo/pqmayo_mayo-5_avx2/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h new file mode 100644 index 0000000000..27b416adce --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h @@ -0,0 +1,524 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_128_H +#define SHUFFLE_ARITHMETIC_128_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + cols_used ++; + __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0; + acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1; + } + } +} + +static +inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= 
_mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1; + } + } +} + + +static +inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = 
_mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k )); + __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1)); + __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2)); + __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3)); + + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1); + } + } +} + + +static +inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i 
temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do 
multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + +static +inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + 
size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i 
*P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], 
in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +static +inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); 
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); + acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; + } +#if K_MAX % 2 == 1 + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); + + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask; + acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); +#endif + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h new file mode 100644 index 0000000000..defff86f8f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h @@ -0,0 +1,481 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_64_H +#define SHUFFLE_ARITHMETIC_64_H + +#include +#include +#include +#include + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t; + } + } +} + + +static +inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ + + const __m256i *P1O_P2 = (__m256i *) _P1O_P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } + } +} + +static +inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const 
unsigned char *O, uint64_t *_acc){ + + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[O_MAX] = {0}; + cols_used += 1; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + pos); + pos += (V_MAX -c - 1); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd); + temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even); + } + } + + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k )); + __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1)); + + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + + _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4)); + _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t); + } + } +} + + +static +inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *L = (__m256i *) _L; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = 
_mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static +inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){ + + const __m256i *Pv = (__m256i *) _Pv; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to 
normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +static +inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular +// same as mayo_12_P1_times_Vt_avx2 +static +inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){ + 
mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc); +} + +static +inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){ + mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc); +} + +static +inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){ + const __m256i *PS2 = (__m256i *) _PS2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + size_t k; + + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for 
one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P1 = (__m256i *) _P1; + const __m256i *P2 = (__m256i *) _P2; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + + // P1 * S1 + for (c=r; c < V_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + P1_cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even); + } + } + + 
// P2 * S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){ + size_t k,c; + const __m256i *P3 = (__m256i *) _P3; + __m256i *acc = (__m256i *) _acc; + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2] = {0}; + + for (c=r; c < O_MAX; c++) + { + __m256i in_odd = _mm256_loadu_si256(P3 + cols_used); + __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; + in_odd &= low_nibble_mask; + cols_used ++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd); + temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k + 1 < K_MAX; k+=2) + { + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= 
temp[k] ^ _mm256_slli_epi16(t,4); + acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t; + } +#if K_MAX % 2 == 1 + __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; + acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4); +#endif + } +} + + +static inline +void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { + (void) size; + int m_vecs_stored = 0; + + for (int r = 0; r < O_MAX; ++r) { + const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); + __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in[0]; + m_vecs_stored++; + for (int c = r + 1; c < O_MAX; ++c) { + const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); + const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); + _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); + _out[0] = _in2[0] ^ _in3[0]; + m_vecs_stored++; + } + } +} + + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h new file mode 100644 index 0000000000..9b3a69d567 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h @@ -0,0 +1,550 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SHUFFLE_ARITHMETIC_96_H +#define SHUFFLE_ARITHMETIC_96_H + +#include +#include +#include +#include + + +// P1*0 -> P1: v x v, O: v x o +static +inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = 
_mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and 
accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t 
*acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + cols_used ++; + // do multiplications for one row and accumulate results in temporary format + __m256i temp[2*O_MAX] = {0}; + size_t pos = r; + for (size_t c = 0; c < r; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + pos += (V_MAX -c - 1); + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); + temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + for (size_t c = r+1; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used ++; + + for (size_t k = 0; k < O_MAX; k+=2) + { + temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); + temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0); + temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1); 
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1); + } + } + + // convert to normal format and add to accumulator + for (size_t k = 0; k < O_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } + } +} + +static +inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){ + + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < O_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + 
temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){ + const __m256i 
low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c = r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ 
temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < V_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < 
K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P2*S2 -> P2: v x o, S2: o x k +static +inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t P1_cols_used = 0; + for (size_t r = 0; r < V_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + // P1 times S1 
+ __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < V_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + P1_cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1); + } + } + + // P2 times S2 + for (c=0; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k 
+ 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular +static +inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k,c; + size_t cols_used = 0; + for (size_t r = 0; r < O_MAX; r++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (c=r; c < O_MAX; c++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = 
_mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + cols_used++; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ 
_mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +static +inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){ + mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); +} + +static +inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){ + const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); + + size_t k; + for (size_t c = 0; c < K_MAX; c++) + { + // do multiplications for one row and accumulate results in temporary format + __m256i temp[K_OVER_2*2*2] = {0}; + for (size_t r = 0; r < O_MAX; r++) + { + __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements + __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; + in_odd0 &= low_nibble_mask; + __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96) + __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask; + in_odd1 &= low_nibble_mask; + + for (k = 0; k < K_OVER_2; k++) + { + temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); + temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0); + temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1); + temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1); + } + } + + // convert to normal format and add to accumulator + for (k = 0; k+1 < K_MAX; k+=2) + { + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6)); + __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ 
_mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0); + _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1); + } +#if K_MAX % 2 == 1 + __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); + __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2)); + + __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask; + __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask; + + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4)); + _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4)); +#endif + } +} + +#undef K_OVER_2 +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + 
p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} 
+ +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i +#include + +void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); +#define AES_ECB_encrypt AES_256_ECB + +#ifdef ENABLE_AESNI +int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen); +#define AES_128_CTR AES_128_CTR_NI +#else +#include +static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, + const unsigned char *input, size_t inputByteLen) { + (void) inputByteLen; + uint8_t iv[12] = { 0 }; + aes128ctr_prf(output, outputByteLen, input, iv); + return (int) outputByteLen; +} +#endif + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/api.c b/src/sig/mayo/pqmayo_mayo-5_opt/api.c new file mode 100644 index 0000000000..f2e861e9c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/api.c @@ -0,0 +1,46 @@ +// 
SPDX-License-Identifier: Apache-2.0 + +#include +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +#define MAYO_PARAMS &MAYO_5 +#else +#define MAYO_PARAMS 0 +#endif + +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk) { + return mayo_keypair(MAYO_PARAMS, pk, sk); +} + +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk) { + return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk); +} + +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *sk) { + return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk); +} + +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk) { + return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk); +} + +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk) { + if (siglen != CRYPTO_BYTES) + return -1; + return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk); +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/api.h b/src/sig/mayo/pqmayo_mayo-5_opt/api.h new file mode 100644 index 0000000000..404d185c08 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/api.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef api_h +#define api_h + +#include + +#define CRYPTO_SECRETKEYBYTES 40 +#define CRYPTO_PUBLICKEYBYTES 5008 +#define CRYPTO_BYTES 838 + +#define CRYPTO_ALGNAME "MAYO-5" + +#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) +int +crypto_sign_keypair(unsigned char *pk, unsigned char *sk); + +#define crypto_sign MAYO_NAMESPACE(crypto_sign) +int +crypto_sign(unsigned char *sm, size_t *smlen, + const unsigned char *m, size_t mlen, + const unsigned char *sk); + +#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) +int +crypto_sign_signature(unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const 
unsigned char *sk); + +#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) +int +crypto_sign_open(unsigned char *m, size_t *mlen, + const unsigned char *sm, size_t smlen, + const unsigned char *pk); + +#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) +int +crypto_sign_verify(const unsigned char *sig, size_t siglen, + const unsigned char *m, size_t mlen, + const unsigned char *pk); + +#endif /* api_h */ + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c new file mode 100644 index 0000000000..de09954c8a --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c @@ -0,0 +1,327 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { +#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64) + mayo12_m_upper(m_legs, in, out, size); +#else + int m_vecs_stored = 0; + for (int r = 0; r < size; r++) { + for (int c = r; c < size; c++) { + m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); + if (r != c) { + m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); + } + m_vecs_stored ++; + } + } +#endif +} + +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + (void) p; + mayo_12_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + (void) p; + mayo_3_P1P1t_times_O(P1, O, acc); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + (void) p; + mayo_5_P1P1t_times_O(P1, O, acc); +#else + #ifndef MAYO_VARIANT + const int m_legs = PARAM_m(p) / 32; + #else + (void) p; + #endif + const int param_o = PARAM_o(p); + const int param_v = PARAM_n(p) - PARAM_o(p);; + + int + bs_mat_entries_used = 0; + for (int r = 0; r < param_v; r++) { + for 
(int c = r; c < param_v; c++) { + if(c==r) { + bs_mat_entries_used += 1; + continue; + } + for (int k = 0; k < param_o; k += 1) { + +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k)); + vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k)); + vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k)); + vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k)); +#else + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k)); + m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k)); +#endif + + } + bs_mat_entries_used += 1; + } + } +#endif +} + +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_12_Vt_times_L_avx2(L, V_multabs, M); + mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_3_Vt_times_L_avx2(L, V_multabs, M); + mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_3_Vt_times_Pv_avx2(Pv, 
V_multabs, Y); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i V_multabs[(K_MAX+1)/2*V_MAX]; + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mayo_V_multabs_avx2(V, V_multabs); + mayo_5_Vt_times_L_avx2(L, V_multabs, M); + mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv); + mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + + alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; + mul_add_mat_x_m_mat(param_m / 32, V, L, M, + param_k, param_n - param_o, param_o); + + mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); + + mul_add_mat_x_m_mat(param_m / 32, V, Pv, + Y, param_k, param_n - param_o, + param_k); +#endif +} + +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { + (void) p; +#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128 + __m256i O_multabs[O_MAX/2*V_MAX]; + mayo_O_multabs_avx2(O, O_multabs); + mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2); + mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3); +#else + #ifdef MAYO_VARIANT + (void) p; + #endif + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int m_legs = PARAM_m(p) / 32; + 
mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); + mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); +#endif +} + + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ] +// [ P3*S2 = P2 ] +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS) { + (void) m; +#if MAYO_AVX + const int n = o + v; + + /* Old approach which is constant time but doesn't have to be */ + unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K + unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + +#if M_MAX == 64 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 96 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + + mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + 
mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#elif M_MAX == 128 + __m256i S1_multabs[(K_MAX+1)/2*V_MAX]; + __m256i S2_multabs[(K_MAX+1)/2*O_MAX]; + mayo_S1_multabs_avx2(S1, S1_multabs); + mayo_S2_multabs_avx2(S2, S2_multabs); + mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS); + mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular + + // S^T * PS = S1^t*PS1 + S2^t*PS2 + //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS); + mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS); + mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS); +#else + NOT IMPLEMENTED +#endif +#else + const int n = o + v; + alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; + mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS); + mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS); +#endif +} + + +// sample a solution x to Ax = y, with r used as randomness +// require: +// - A is a matrix with m rows and k*o+1 columns (values in the last column are +// not important, they will be overwritten by y) in row major order +// - y is a vector with m elements +// - r and x are k*o bytes long +// return: 1 on success, 0 on failure +int sample_solution(const mayo_params_t *p, unsigned char *A, + const unsigned char *y, const unsigned char *r, + unsigned char *x, int k, int o, int m, int A_cols) { + #ifdef MAYO_VARIANT + (void) p; + #endif + unsigned char finished; + int col_upper_bound; + unsigned char correct_column; + + // x <- r + for (int i = 0; i < k * o; i++) { + x[i] = r[i]; + } + + // compute Ar; + unsigned char Ar[M_MAX]; + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = 0; // clear last col of A + } + mat_mul(A, r, Ar, k * o + 1, m, 1); + + // move y - Ar to last column of matrix A + for (int i = 0; i < m; i++) { + A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]); + } + + EF(A, m, k * o + 1); + + // check if last row 
of A (excluding the last entry of y) is zero + unsigned char full_rank = 0; + for (int i = 0; i < A_cols - 1; i++) { + full_rank |= A[(m - 1) * A_cols + i]; + } + +// It is okay to leak if we need to restart or not +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1); +#endif + + if (full_rank == 0) { + return 0; + } + + // back substitution in constant time + // the index of the first nonzero entry in each row is secret, which makes + // things less efficient + + for (int row = m - 1; row >= 0; row--) { + finished = 0; + col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o); + // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32} + + for (int col = row; col <= col_upper_bound; col++) { + + // Compare two chars in constant time. + // Returns 0x00 if the byte arrays are equal, 0xff otherwise. + correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished; + + unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; + x[col] ^= u; + + for (int i = 0; i < row; i += 8) { + uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8) + ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24) + ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40) + ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56); + + tmp = mul_fx8(u, tmp); + + A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf; + A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf; + A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf; + A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf; + A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf; + A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf; + A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf; + A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf; + } + + finished = finished | correct_column; + } + } + return 1; +} + diff --git 
a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h new file mode 100644 index 0000000000..c2fb4fe334 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h @@ -0,0 +1,62 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_H +#define ARITHMETIC_H + +#include +#include +#include + +#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) +#define TARGET_BIG_ENDIAN +#endif + +#if defined(MAYO_AVX) && (M_MAX == 64) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 96) + #include +#endif +#if defined(MAYO_AVX) && (M_MAX == 128) + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96)) + #include + #include +#endif +#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128)) + #include +#endif + +// Calculate P3 = O^T * (P1*O + P2) in KeyGen +#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2) +void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3); + +// Calculate Upper in KeyGen +#define m_upper MAYO_NAMESPACE(m_upper) +void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size); + +// Calculate acc = (P1+P1^T)*O in expand_sk +#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O) +void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc); + +// Calculate M=V*L and Y=V*P1*V^T in Sign +#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt) +void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y); + +// Sample solution in Sign +#define sample_solution MAYO_NAMESPACE(sample_solution) +int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned 
char *x, int k, int o, int m, int A_cols); + +// Calculate SPS = S*P*S^T in Verify +#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS) +void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *SPS); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h new file mode 100644 index 0000000000..418c308e2f --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_128_H +#define ARITHMETIC_128_H + +#include +#include +#include + +// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_128(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; + out[6] = in[6]; + out[7] = in[7]; +} + + +static +inline void vec_add_128(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; + acc[6] ^= in[6]; + acc[7] ^= in[7]; +} + +inline +static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 8;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 8;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * 
((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8); + m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); + m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8); + m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8); + m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8); + m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8); + m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8); + m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8); + m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8); + m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8); + m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8); + m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8); + m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8); + m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8); + vec_copy_128(bins + 8, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h new file mode 100644 index 0000000000..a70b7a3118 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h @@ -0,0 +1,128 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_64_H +#define ARITHMETIC_64_H + +#include +#include + +// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_64(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; +} + +static +inline void vec_add_64(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; +} + +static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { + uint64_t mask_msb = 0x8888888888888888ULL; + uint64_t a_msb; + uint64_t a64 = a; + uint64_t b32 = b; + uint64_t r64 = a64 * (b32 & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 
^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 1) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 2) & 1); + + a_msb = a64 & mask_msb; // MSB, 3rd bits + a64 ^= a_msb; // clear MSB + a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); + r64 ^= (a64) * ((b32 >> 3) & 1); + + return r64; +} + +static inline uint32_t mul_table(uint8_t b){ + uint32_t x = ((uint32_t) b) * 0x08040201; + + uint32_t high_nibble_mask = 0xf0f0f0f0; + + uint32_t high_half = x & high_nibble_mask; + return (x ^ (high_half >> 4) ^ (high_half >> 3)); +} + +static +inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 4;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < legs; i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +inline +static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { + uint64_t mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 4;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void 
multiply_bins_64(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4); + m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); + m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4); + m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4); + m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4); + m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4); + m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4); + m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4); + m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4); + m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4); + m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4); + m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4); + m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4); + m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4); + vec_copy_64(bins + 4, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h new file mode 100644 index 0000000000..a38f89e454 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h @@ -0,0 +1,85 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_96_H +#define ARITHMETIC_96_H + +#include +#include +#include + +// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1) + +static +inline void vec_copy_96(const uint64_t *in, uint64_t *out) { + out[0] = in[0]; + out[1] = in[1]; + out[2] = in[2]; + out[3] = in[3]; + out[4] = in[4]; + out[5] = in[5]; +} + +static +inline void vec_add_96(const uint64_t *in, uint64_t *acc) { + acc[0] ^= in[0]; + acc[1] ^= in[1]; + acc[2] ^= in[2]; + acc[3] ^= in[3]; + acc[4] ^= in[4]; + acc[5] ^= in[5]; +} + +inline +static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { + uint64_t mask_msb = 0x8888888888888888ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_msb; + acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); + } +} + +inline +static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { + uint64_t 
mask_lsb = 0x1111111111111111ULL; + for(int i=0; i < 6;i++){ + uint64_t t = in[i] & mask_lsb; + acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); + } +} + +static +inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { + uint32_t tab = mul_table(a); + + uint64_t lsb_ask = 0x1111111111111111ULL; + + for(int i=0; i < 6;i++){ + acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff) + ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf) + ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf) + ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf); + } +} + +static +inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { + + m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6); + m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); + m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6); + m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6); + m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6); + m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6); + m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6); + m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6); + m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6); + m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6); + m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6); + m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6); + m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6); + m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6); + vec_copy_96(bins + 6, out); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h new file mode 100644 index 0000000000..d337bc238c --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h @@ -0,0 +1,469 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ARITHMETIC_COMMON_H +#define ARITHMETIC_COMMON_H + +#include + +#ifndef MAYO_VARIANT +static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) { + + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2); + m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 
3 * m_legs * 2); + + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2); + + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2); + m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2); + + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2); + + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2); + m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2); + m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2); + + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2); + + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2); + m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2); + + m_vec_copy(m_legs, bins + 1 * m_legs * 2, out); +} +#endif + +// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2] +// [ 0 P3 ] [S2] [ P3*S2] +static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S, + const int m, const int v, const int o, const int k, uint64_t *PS) { + + const int n = o + v; +#if 
defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128) + (void)m; +#else + const int m_legs = m / 32; +#endif + + /* Old approach which is constant time but doesn't have to be + unsigned char S1[V_MAX*K_MAX]; + unsigned char S2[O_MAX*K_MAX]; + unsigned char *s1_write = S1; + unsigned char *s2_write = S2; + for (int r=0; r < k; r++) + { + for (int c = 0; c < n; c++) + { + if(c < v){ + *(s1_write++) = S[r*n + c]; + } else { + *(s2_write++) = S[r*n + c]; + } + } + } + + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2 + mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2. + */ + + // use more stack efficient version for MAYO_3 and MAYO_5 + #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78 + uint64_t accumulator[M_MAX * N_MAX] = {0}; + int P1_used; + int P3_used; + for (int col = 0; col < k; col++) { + for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) { + accumulator[i] = 0; + } + P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8); +#else + bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P1_used ++; + } + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + 
S[(col * n) + j + v] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 ); +#else + bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 ); +#endif + } + } + + P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8); +#else + bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); +#endif + P3_used ++; + } + } + + for (int row = 0; row < n; row++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8); +#else + bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2); +#endif + } + } + + #else + + alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0}; + int P1_used = 0; + for (int row = 0; row < v; row++) { + for (int j = row; j < v; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, 
accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif 
defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P1_used ++; + } + + + for (int j = 0; j < o; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator 
+ ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 ); + vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 ); + vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + 
j + v] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12) + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 ); + vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 ); + } +#endif + } + } + + int P3_used = 0; + for (int row = v; row < n; row++) { + for (int j = row; j < n; j++) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 
16 + S[3 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4) + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 ); + vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11) + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 ); + vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 ); +#elif defined(MAYO_VARIANT) && (M_MAX == 
128) && (K_MAX == 12) + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 ); + vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 ); +#else + for (int col = 0; col < k; col++) { + m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 ); + } +#endif + P3_used ++; + } + } + + // multiply stuff according to the bins of the accumulator and add to PS. 
+ int i = 0; + while (i < n * k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8); + i++; +#else + m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2); + i++; +#endif + } + + #endif +} + + +// Computes SPS from PS and S using a bucket accumulator: for every (row, col) of the k x k output and every j < n, the PS vector at index (j * k + col) is added into one of 16 accumulator bins, the bin being selected by S[row * n + j]. +// The bins are afterwards combined into SPS by the multiply_bins loop at the end of this function. +// m is only read in the generic build (MAYO_VARIANT undefined), where m_legs = m/32; the fixed-size variants use 4/6/8 uint64_t words per vector for M_MAX = 64/96/128. +// NOTE(review): S entries are presumably 4-bit values (< 16), since each output slot owns exactly 16 bins -- confirm against callers. +static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){ + alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; + #if !defined(MAYO_VARIANT) + const int m_legs = m/32; + #else + (void) m; + #endif + for (int row = 0; row < k; row++) { + for (int j = 0; j < n; j++) { + for (int col = 0; col < k; col += 1) { + #if defined(MAYO_VARIANT) && (M_MAX == 64) + vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 96) + vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 ); + #elif defined(MAYO_VARIANT) && (M_MAX == 128) + vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 ); + #else + m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 ); + #endif + } + } + } + + // multiply stuff according to the bins of the accumulator and add to SPS.
+ int i = 0; + while (i < k*k) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6); + i++; +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8); + i++; +#else + m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2); + i++; +#endif + } +} + + +// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { + + int bs_mat_entries_used = 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc +static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { + + int bs_mat_entries_used 
= 0; + for (int r = 0; r < bs_mat_rows; r++) { + for (int c = triangular * r; c < bs_mat_cols; c++) { + for (int k = 0; k < mat_rows; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k)); +#endif + } + bs_mat_entries_used += 1; + } + } +} + +// multiplies the transpose of a single matrix with m matrices and adds result to acc +static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_cols; r++) { + for (int c = 0; c < mat_rows; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} + +// multiplies a single matrix with m matrices and adds result to acc +static 
inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { + + for (int r = 0; r < mat_rows; r++) { + for (int c = 0; c < mat_cols; c++) { + for (int k = 0; k < bs_mat_cols; k += 1) { +#if defined(MAYO_VARIANT) && (M_MAX == 64) + (void) m_legs; + vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 96) + (void) m_legs; + vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k)); +#elif defined(MAYO_VARIANT) && (M_MAX == 128) + (void) m_legs; + vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k)); +#else + m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k)); +#endif + } + } + } +} +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h new file mode 100644 index 0000000000..82505847c9 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h @@ -0,0 +1,152 @@ + +// SPDX-License-Identifier: Apache-2.0 + +#ifndef ECHELON_FORM_H +#define ECHELON_FORM_H + +#include +#include +#include +#include + +#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y)) +#define MAYO_MIN(x, y) (((x) < (y)) ? 
(x) : (y)) + +// Returns the 4-bit element number `index` from an array of 64-bit words that hold 16 elements each; within a word, element 0 sits in the least-significant nibble. +static inline unsigned char +m_extract_element(const uint64_t *in, int index) { + const int leg = index / 16; + const int offset = index % 16; + + return (in[leg] >> (offset*4)) & 0xF; +} + +// Packs `ncols` one-per-byte values from `in` into nibbles in `out`, two per byte: the even-indexed value goes to the low nibble, the odd-indexed one to the high nibble. +// When ncols is odd the last byte receives only a low nibble. Only the first (ncols + 1) / 2 bytes of `out` are written; the rest is left as it was. +// On TARGET_BIG_ENDIAN the byte index is mirrored inside each 8-byte group (8g + 7 - b%8). +// NOTE(review): values are not masked, so this presumably expects every in[] entry to be < 16 -- confirm against callers. +static inline void +ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) { + int i; + unsigned char *out8 = (unsigned char *)out; + for(i = 0; i+1 < ncols; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4); +#else + out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4); +#endif + } + if (ncols % 2 == 1){ +#ifdef TARGET_BIG_ENDIAN + out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0); +#else + out8[i/2] = (in[i+0] << 0); +#endif + } +} + +// Inverse of ef_pack_m_vec: expands legs * 16 nibbles from `in` into `out`, one value per byte (low nibble first), using the same byte mirroring on TARGET_BIG_ENDIAN. +// Writes exactly legs * 16 bytes, so `out` must be at least that large. +static inline void +ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) { + const unsigned char *in8 = (const unsigned char *)in; + for(int i = 0; i < legs * 16; i += 2){ +#ifdef TARGET_BIG_ENDIAN + out[i] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF; + out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + out[i] = (in8[i/2]) & 0xF; + out[i+1] = (in8[i/2] >> 4); +#endif + } +} + + +// put matrix in row echelon form with ones on first nonzero entries *in +// constant time* +static inline void EF(unsigned char *A, int nrows, int ncols) { + + alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16]; + alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 }; + + int row_len = (ncols + 15) / 16; + + // nibbleslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols); + } + + // pivot row is secret, pivot col is not + + unsigned char inverse; + int pivot_row = 0; + for (int pivot_col = 0; pivot_col < ncols; pivot_col++) { + + int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols); + int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col); + // the pivot row is
guaranteed to be between these lower and upper bounds if + // A has full rank + + // zero out pivot row + for (int i = 0; i < row_len; i++) { + _pivot_row[i] = 0; + _pivot_row2[i] = 0; + } + + // try to get a pivot row in constant time + unsigned char pivot = 0; + uint64_t pivot_is_zero = -1; + for (int row = pivot_row_lower_bound; + row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) { + + uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); + uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); + + for (int j = 0; j < row_len; j++) { + _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) & + packed_A[row * row_len + j]; + } + pivot = m_extract_element(_pivot_row, pivot_col); + pivot_is_zero = ~ct_compare_64((int) pivot, 0); + } + + // multiply pivot row by inverse of pivot + inverse = inverse_f(pivot); + vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); + + // conditionally write pivot row to the correct row, if there is a nonzero + // pivot + for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) { + uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero; + uint64_t do_not_copy = ~do_copy; + for (int col = 0; col < row_len; col++) { + packed_A[row * row_len + col] = + (do_not_copy & packed_A[row * row_len + col]) + + (do_copy & _pivot_row2[col]); + } + } + + // eliminate entries below pivot + for (int row = pivot_row_lower_bound; row < nrows; row++) { + unsigned char below_pivot = (row > pivot_row); + unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col); + + vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim, + packed_A + row * row_len); + } + + pivot_row += (-(int64_t)(~pivot_is_zero)); + } + + unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; + + // unbitslice the matrix A + for (int i = 0; i < nrows; i++) { + ef_unpack_m_vec(row_len, packed_A + i * row_len, temp); + for (int j = 0; j < ncols; j++) { + A[i * ncols + j] = temp[j]; + } + } + 
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15); + mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8); + mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8); +} + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c new file mode 100644 index 0000000000..ce1ccd4c71 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c @@ -0,0 +1,656 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef ENABLE_CT_TESTING +#include +#endif + +#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y)) +#define PK_PRF AES_128_CTR + +static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) { + int i; + for (i = 0; i < mdeclen / 2; ++i) { + *mdec++ = m[i] & 0xf; + *mdec++ = m[i] >> 4; + } + + if (mdeclen % 2 == 1) { + *mdec++ = m[i] & 0x0f; + } +} + +static void encode(const unsigned char *m, unsigned char *menc, int mlen) { + int i; + for (i = 0; i < mlen / 2; ++i, m += 2) { + menc[i] = (*m) | (*(m + 1) << 4); + } + + if (mlen % 2 == 1) { + menc[i] = (*m); + } +} + +static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ + #ifndef ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *vPv = _vPv; + uint64_t temp[M_MAX/16] = {0}; + unsigned char *temp_bytes = (unsigned char *) temp; + int k = 0; + for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { + for (int j = i; j < PARAM_k(p); j++) { + // multiply by X (shift up 4 bits) + unsigned char top = temp[k] >> 60; + temp[k] <<= 4; + k--; + for(; k>=0; k--){ + temp[k+1] ^= temp[k] >> 60; + temp[k] <<= 4; + } + // reduce mod f(X) + for (int jj = 0; jj < F_TAIL_LEN; jj++) { + if(jj%2 == 0){ +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#else + 
temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]); +#endif + } + else { +#ifdef TARGET_BIG_ENDIAN + temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#else + temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4; +#endif + } + } + + // extract from vPv and add + for(k=0; k < PARAM_m(p)/16; k ++){ + temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]); + } + k--; + } + } + + // add to y + for (int i = 0; i < PARAM_m(p); i+=2) + { +#ifdef TARGET_BIG_ENDIAN + y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4); +#else + y[i] = t[i] ^ (temp_bytes[i/2] & 0xF); + y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4); +#endif + + } +} + +static void transpose_16x16_nibbles(uint64_t *M){ + static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f; + static const uint64_t even_bytes = 0x00ff00ff00ff00ff; + static const uint64_t even_2bytes = 0x0000ffff0000ffff; + static const uint64_t even_half = 0x00000000ffffffff; + + for (size_t i = 0; i < 16; i+=2) + { + uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; + M[i ] ^= t << 4; + M[i+1] ^= t; + } + + for (size_t i = 0; i < 16; i+=4) + { + uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes; + uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; + M[i ] ^= (t0 << 8); + M[i+1] ^= (t1 << 8); + M[i+2] ^= t0; + M[i+3] ^= t1; + } + + for (size_t i = 0; i < 4; i++) + { + uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes; + uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes; + + M[i ] ^= t0 << 16; + M[i+ 8] ^= t1 << 16; + M[i+ 4] ^= t0; + M[i+12] ^= t1; + } + + for (size_t i = 0; i < 8; i++) + { + uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; + M[i ] ^= t << 32; + M[i+8] ^= t; + } +} + +#define MAYO_M_OVER_8 ((M_MAX + 7) / 8) + +static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ + #ifndef 
ENABLE_PARAMS_DYNAMIC + (void) p; + #endif + + const uint64_t *VtL = _VtL; + int bits_to_shift = 0; + int words_to_shift = 0; + uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0}; + size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; + const uint64_t *Mi, *Mj; + + for (int i = 0; i <= PARAM_k(p) - 1; ++i) { + for (int j = PARAM_k(p) - 1; j >= i; --j) { + // add the M_i and M_j to A, shifted "down" by l positions + Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + + if (i != j) { + Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p); + for (int c = 0; c < PARAM_o(p); c++) { + for (int k = 0; k < PARAM_m(p)/16; k++) + { + A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift; + if(bits_to_shift > 0){ + A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift); + } + } + } + } + + bits_to_shift += 4; + if(bits_to_shift == 64){ + words_to_shift ++; + bits_to_shift = 0; + } + } + } + + for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) + { + transpose_16x16_nibbles(A + c); + } + + unsigned char tab[F_TAIL_LEN*4] = {0}; + for (size_t i = 0; i < F_TAIL_LEN; i++) + { + tab[4*i] = mul_f(PARAM_f_tail(p)[i],1); + tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2); + tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4); + tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8); + } + + uint64_t low_bit_in_nibble = 0x1111111111111111; + + for (size_t c = 0; c < A_width; c+= 16) + { + for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) + { + size_t pos = (r/16)*A_width + c + (r%16); + uint64_t t0 = A[pos] & low_bit_in_nibble; + 
uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; + uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; + uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; + for (size_t t = 0; t < F_TAIL_LEN; t++) + { + A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3]; + } + } + } + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) + A[i] = BSWAP64(A[i]); +#endif + + for (int r = 0; r < PARAM_m(p); r+=16) + { + for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16) + { + for (size_t i = 0; i < 16; i++) + { + decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); + } + } + } +} + +// Public API + +// Generates a key pair by delegating to mayo_keypair_compact and returns its status code unchanged. +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) { + return mayo_keypair_compact(p, pk, sk); +} + +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + int ret = MAYO_OK; + unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data + unsigned char y[M_MAX]; // secret data + unsigned char salt[SALT_BYTES_MAX]; // not secret data + unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX], + Vdec[N_MINUS_O_MAX * K_MAX]; // secret data + unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data + unsigned char x[K_MAX * N_MAX]; // not secret data + unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data + unsigned char s[K_MAX * N_MAX]; // not secret data + const unsigned char *seed_sk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data + alignas(32) sk_t sk; // secret data + unsigned char Ox[N_MINUS_O_MAX]; // secret data + // unsigned char Mdigest[DIGEST_BYTES]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; + unsigned char *ctrbyte; + unsigned char *vi; + + const int
param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_v_bytes = PARAM_v_bytes(p); + const int param_r_bytes = PARAM_r_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P2_bytes = PARAM_P2_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_A_cols = PARAM_A_cols(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + ret = mayo_expand_sk(p, csk, &sk); + if (ret != MAYO_OK) { + goto err; + } + + seed_sk = csk; + decode(sk.o, O, (param_n - param_o) * param_o); + + // hash message + shake256(tmp, param_digest_bytes, m, mlen); + + uint64_t *P1 = sk.p; + uint64_t *L = P1 + (param_P1_bytes/8); + alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + L[i] = BSWAP64(L[i]); + } +#endif + + // choose the randomizer + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(tmp + param_digest_bytes, param_salt_bytes); + #else + if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // hashing to salt + memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, + param_sk_seed_bytes); + shake256(salt, param_salt_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret +#endif + + // hashing to t + memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); + ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes; + + shake256(tenc, param_m_bytes, tmp, 
param_digest_bytes + param_salt_bytes); + + decode(tenc, t, param_m); // may not be necessary + + for (int ctr = 0; ctr <= 255; ++ctr) { + *ctrbyte = (unsigned char)ctr; + + shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1); + + // decode the v_i vectors + for (int i = 0; i <= param_k - 1; ++i) { + decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), + param_n - param_o); + } + + // compute all the V * L matrices. + // compute all the V * P1 * V^T matrices. + alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0}; + V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y); + + compute_rhs(p, Y, t, y); + compute_A(p, Mtmp, A); + + decode(V + param_k * param_v_bytes, r, + param_k * + param_o); + if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) { + break; + } else { + memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); + } + } + + // s is already 0 + // TODO: optimize this? 
+ for (int i = 0; i <= param_k - 1; ++i) { + vi = Vdec + i * (param_n - param_o); + mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1); + mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1); + memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o); + } + encode(s, sig, param_n * param_k); + memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); + *siglen = param_sig_bytes; +err: + mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX); + mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX); + mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1)); + mayo_secure_clear(r, K_MAX * O_MAX + 1); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(&sk, sizeof(sk_t)); + mayo_secure_clear(Ox, N_MINUS_O_MAX); + mayo_secure_clear(tmp, + DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1); + return ret; +} + +// Signs m and writes signature || message to sm. +// sm must have room for PARAM_sig_bytes(p) + mlen bytes and may overlap m (in-place signing). +// On success returns MAYO_OK and sets *smlen to PARAM_sig_bytes(p) + mlen; on failure returns the error code of mayo_sign_signature (or MAYO_ERR on a signature-length mismatch) and leaves *smlen untouched. +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *csk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + size_t siglen = param_sig_bytes; + int ret; + + // Move the message into its final place before the signature is written: + // if m overlaps sm, writing the signature first would clobber the start of m + // and the message stored after the signature would be corrupt. + memmove(sm + param_sig_bytes, m, mlen); + ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk); + if (ret != MAYO_OK) { + goto err; + } + if (siglen != (size_t) param_sig_bytes) { + // never report success without having set *smlen + ret = MAYO_ERR; + goto err; + } + + *smlen = siglen + mlen; +err: + return ret; +} + +// Verifies the signed message sm (signature || message) with pk; on success copies the message part to m and sets *mlen. +// Returns MAYO_ERR when smlen is shorter than a signature, otherwise the result of mayo_verify. +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk) { + const int param_sig_bytes = PARAM_sig_bytes(p); + if (smlen < (size_t)param_sig_bytes) { + return MAYO_ERR; + } + int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm, + pk); + + if (result == MAYO_OK) { + *mlen = smlen - param_sig_bytes; + memmove(m, sm + param_sig_bytes, *mlen); + } + + return result; +} + +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk) { + int ret = MAYO_OK; + unsigned char *seed_sk = csk; + unsigned char
S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8]; + alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0}; + + unsigned char *seed_pk; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_m = PARAM_m(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + // seed_sk $←- B^(sk_seed bytes) + #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL) + randombytes(seed_sk, param_sk_seed_bytes); + #else + if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) { + ret = MAYO_ERR; + goto err; + } + #endif + + // S ← shake256(seedsk, pk seed bytes + O bytes) + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + // seed_pk ← s[0 : pk_seed_bytes] + seed_pk = S; + + // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes]) + decode(S + param_pk_seed_bytes, O, param_v * param_o); + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + + int m_legs = param_m / 32; + + uint64_t *P1 = P; + uint64_t *P1O_P2 = P + (param_P1_bytes / 8); + + // compute P3 = O^t * (P1*O + P2) + Ot_times_P1O_P2(p, P1, O, P1O_P2, P3); + + // store seed_pk in cpk + memcpy(cpk, seed_pk, param_pk_seed_bytes); + + alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8]; + + // compute Upper(P3) and store in cpk + m_upper(m_legs, P3, P3_upper, param_o); + + memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes); + + +#if !defined(PQM4) && 
!defined(HAVE_RANDOMBYTES_NORETVAL) +err: +#endif + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2); + return ret; +} + +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *pk) { + #ifdef MAYO_VARIANT + (void)p; + #endif + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_P3_bytes = PARAM_P3_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes); + pk += param_P1_bytes + param_P2_bytes; + memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes); + return MAYO_OK; +} + +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *sk) { + int ret = MAYO_OK; + unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; + uint64_t *P = sk->p; + unsigned char O[(N_MINUS_O_MAX)*O_MAX]; + + const int param_o = PARAM_o(p); + const int param_v = PARAM_v(p); + const int param_O_bytes = PARAM_O_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); + const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p); + const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p); + + const unsigned char *seed_sk = csk; + unsigned char *seed_pk = S; + + shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk, + param_sk_seed_bytes); + decode(S + param_pk_seed_bytes, O, + param_v * param_o); // O = S + PK_SEED_BYTES; + +#ifdef ENABLE_CT_TESTING + VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes); +#endif + + // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format + PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk, + param_pk_seed_bytes); + + uint64_t *P2 = P + (param_P1_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + uint64_t *P1 = P; + // compute L_i = (P1 
+ P1^t)*O + P2 + uint64_t *L = P2; + P1P1t_times_O(p, P1, O, L); + + // write to sk + memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { + P[i] = BSWAP64(P[i]); + } +#endif + + mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); + mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX); + return ret; +} + +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *cpk) { + unsigned char tEnc[M_BYTES_MAX]; + unsigned char t[M_MAX]; + unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X) + unsigned char s[K_MAX * N_MAX]; + alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; + unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; + + const int param_m = PARAM_m(p); + const int param_n = PARAM_n(p); + const int param_v = PARAM_v(p); + const int param_o = PARAM_o(p); + const int param_k = PARAM_k(p); + const int param_m_bytes = PARAM_m_bytes(p); + const int param_P1_bytes = PARAM_P1_bytes(p); + const int param_P2_bytes = PARAM_P2_bytes(p); +#ifdef TARGET_BIG_ENDIAN + const int param_P3_bytes = PARAM_P3_bytes(p); +#endif + const int param_sig_bytes = PARAM_sig_bytes(p); + const int param_digest_bytes = PARAM_digest_bytes(p); + const int param_salt_bytes = PARAM_salt_bytes(p); + + int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); + if (ret != MAYO_OK) { + return MAYO_ERR; + } + + uint64_t *P1 = pk; + uint64_t *P2 = pk + (param_P1_bytes / 8); + uint64_t *P3 = P2 + (param_P2_bytes / 8); + +#ifdef TARGET_BIG_ENDIAN + for (int i = 0; i < param_P1_bytes / 8; ++i) { + P1[i] = BSWAP64(P1[i]); + } + for (int i = 0; i < param_P2_bytes / 8; ++i) { + P2[i] = BSWAP64(P2[i]); + } + for (int i = 0; i < param_P3_bytes / 8; ++i) { + P3[i] = BSWAP64(P3[i]); + } +#endif + + // hash m + shake256(tmp, param_digest_bytes, m, mlen); + + // compute t + memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - 
param_salt_bytes, + param_salt_bytes); + shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes); + decode(tEnc, t, param_m); + + // decode s + decode(sig, s, param_k * param_n); + + // Compute S*P*S^T + alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0}; + + m_calculate_PS_SPS(P1, P2, P3, s, param_m, + param_v, param_o, param_k, SPS); + + // combine the vectors in SPS and reduce mod f(X) + compute_rhs(p, SPS, y, y); + + if (memcmp(y, t, param_m) == 0) { + return MAYO_OK; // good signature + } + return MAYO_ERR; // bad signature +} + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h new file mode 100644 index 0000000000..1de4bf2a33 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h @@ -0,0 +1,435 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MAYO_H +#define MAYO_H + +#include +#include + +#define F_TAIL_LEN 5 +#define F_TAIL_64 \ + { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3 +#define F_TAIL_96 \ + { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x +#define F_TAIL_128 \ + { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2 + +#define MAYO_1_name "MAYO_1" +#define MAYO_1_n 66 +#define MAYO_1_m 64 +#define MAYO_1_o 8 +#define MAYO_1_v (MAYO_1_n - MAYO_1_o) +#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) +#define MAYO_1_k 9 +#define MAYO_1_q 16 +#define MAYO_1_m_bytes 32 +#define MAYO_1_O_bytes 232 +#define MAYO_1_v_bytes 29 +#define MAYO_1_r_bytes 36 +#define MAYO_1_P1_bytes 54752 +#define MAYO_1_P2_bytes 14848 +#define MAYO_1_P3_bytes 1152 +#define MAYO_1_csk_bytes 24 +#define MAYO_1_esk_bytes 69856 +#define MAYO_1_cpk_bytes 1168 +#define MAYO_1_epk_bytes 70752 +#define MAYO_1_sig_bytes 321 +#define MAYO_1_f_tail F_TAIL_64 +#define MAYO_1_f_tail_arr f_tail_64 +#define MAYO_1_salt_bytes 24 +#define MAYO_1_digest_bytes 32 +#define MAYO_1_pk_seed_bytes 16 +#define MAYO_1_sk_seed_bytes 24 + +#define MAYO_2_name "MAYO_2" +#define MAYO_2_n 78 +#define MAYO_2_m 64 
+#define MAYO_2_o 18 +#define MAYO_2_v (MAYO_2_n - MAYO_2_o) +#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) +#define MAYO_2_k 4 +#define MAYO_2_q 16 +#define MAYO_2_m_bytes 32 +#define MAYO_2_O_bytes 540 +#define MAYO_2_v_bytes 30 +#define MAYO_2_r_bytes 36 +#define MAYO_2_P1_bytes 58560 +#define MAYO_2_P2_bytes 34560 +#define MAYO_2_P3_bytes 5472 +#define MAYO_2_csk_bytes 24 +#define MAYO_2_esk_bytes 93684 +#define MAYO_2_cpk_bytes 5488 +#define MAYO_2_epk_bytes 98592 +#define MAYO_2_sig_bytes 180 +#define MAYO_2_f_tail F_TAIL_64 +#define MAYO_2_f_tail_arr f_tail_64 +#define MAYO_2_salt_bytes 24 +#define MAYO_2_digest_bytes 32 +#define MAYO_2_pk_seed_bytes 16 +#define MAYO_2_sk_seed_bytes 24 + +#define MAYO_3_name "MAYO_3" +#define MAYO_3_n 99 +#define MAYO_3_m 96 +#define MAYO_3_o 10 +#define MAYO_3_v (MAYO_3_n - MAYO_3_o) +#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) +#define MAYO_3_k 11 +#define MAYO_3_q 16 +#define MAYO_3_m_bytes 48 +#define MAYO_3_O_bytes 445 +#define MAYO_3_v_bytes 45 +#define MAYO_3_r_bytes 55 +#define MAYO_3_P1_bytes 192240 +#define MAYO_3_P2_bytes 42720 +#define MAYO_3_P3_bytes 2640 +#define MAYO_3_csk_bytes 32 +#define MAYO_3_esk_bytes 235437 +#define MAYO_3_cpk_bytes 2656 +#define MAYO_3_epk_bytes 237600 +#define MAYO_3_sig_bytes 577 +#define MAYO_3_f_tail F_TAIL_96 +#define MAYO_3_f_tail_arr f_tail_96 +#define MAYO_3_salt_bytes 32 +#define MAYO_3_digest_bytes 48 +#define MAYO_3_pk_seed_bytes 16 +#define MAYO_3_sk_seed_bytes 32 + +#define MAYO_5_name "MAYO_5" +#define MAYO_5_n 133 +#define MAYO_5_m 128 +#define MAYO_5_o 12 +#define MAYO_5_v (MAYO_5_n - MAYO_5_o) +#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) +#define MAYO_5_k 12 +#define MAYO_5_q 16 +#define MAYO_5_m_bytes 64 +#define MAYO_5_O_bytes 726 +#define MAYO_5_v_bytes 61 +#define MAYO_5_r_bytes 72 +#define MAYO_5_P1_bytes 472384 +#define MAYO_5_P2_bytes 92928 +#define MAYO_5_P3_bytes 4992 +#define MAYO_5_csk_bytes 40 +#define MAYO_5_esk_bytes 566078 +#define 
MAYO_5_cpk_bytes 5008 +#define MAYO_5_epk_bytes 570304 +#define MAYO_5_sig_bytes 838 +#define MAYO_5_f_tail F_TAIL_128 +#define MAYO_5_f_tail_arr f_tail_128 +#define MAYO_5_salt_bytes 40 +#define MAYO_5_digest_bytes 64 +#define MAYO_5_pk_seed_bytes 16 +#define MAYO_5_sk_seed_bytes 40 + +#define PARAM_JOIN2_(a, b) a##_##b +#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) +#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) + +#if defined(MAYO_VARIANT) +#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c +#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) +#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) + +#if defined(MAYO_BUILD_TYPE_REF) +#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s) +#elif defined(MAYO_BUILD_TYPE_OPT) +#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s) +#elif defined(MAYO_BUILD_TYPE_AVX2) +#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s) +#else +#error "Build type not known" +#endif + +#else +#define MAYO_NAMESPACE(s) s +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define NAME_MAX mayo5 +#define N_MAX 133 +#define M_MAX 128 +#define O_MAX 18 +#define V_MAX 121 +#define K_MAX 12 +#define Q_MAX 16 +#define PK_SEED_BYTES_MAX 16 +#define SK_SEED_BYTES_MAX 40 +#define SALT_BYTES_MAX 40 +#define DIGEST_BYTES_MAX 64 +#define O_BYTES_MAX 726 +#define V_BYTES_MAX 61 +#define R_BYTES_MAX 72 +#define P1_BYTES_MAX 472384 +#define P2_BYTES_MAX 92928 +#define P3_BYTES_MAX 5472 +#define SIG_BYTES_MAX 838 +#define CPK_BYTES_MAX 5488 +#define CSK_BYTES_MAX 40 +#define ESK_BYTES_MAX 566078 +#define EPK_BYTES_MAX 570304 +#define N_MINUS_O_MAX 121 +#define M_BYTES_MAX 64 +#elif defined(MAYO_VARIANT) +#define M_MAX PARAM_NAME(m) +#define N_MAX PARAM_NAME(n) +#define O_MAX PARAM_NAME(o) +#define V_MAX PARAM_NAME(v) +#define K_MAX PARAM_NAME(k) +#define Q_MAX PARAM_NAME(q) +#define M_BYTES_MAX PARAM_NAME(m_bytes) +#define O_BYTES_MAX PARAM_NAME(O_bytes) +#define V_BYTES_MAX PARAM_NAME(v_bytes) +#define R_BYTES_MAX PARAM_NAME(r_bytes) +#define P1_BYTES_MAX 
PARAM_NAME(P1_bytes) +#define P2_BYTES_MAX PARAM_NAME(P2_bytes) +#define P3_BYTES_MAX PARAM_NAME(P3_bytes) +#define SIG_BYTES_MAX PARAM_NAME(sig_bytes) +#define CSK_BYTES_MAX PARAM_NAME(csk_bytes) +#define ESK_BYTES_MAX PARAM_NAME(esk_bytes) +#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes) +#define EPK_BYTES_MAX PARAM_NAME(epk_bytes) +#define N_MINUS_O_MAX (N_MAX - O_MAX) +#define SALT_BYTES_MAX PARAM_NAME(salt_bytes) +#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes) +#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes) +#define SK_SEED_BYTES_MAX SALT_BYTES_MAX +#else +#error "Parameter not specified" +#endif + +#ifdef ENABLE_PARAMS_DYNAMIC +#define PARAM_name(p) (p->name) +#define PARAM_m(p) (p->m) +#define PARAM_n(p) (p->n) +#define PARAM_o(p) (p->o) +#define PARAM_v(p) (p->n - p->o) +#define PARAM_A_cols(p) (p->k * p->o + 1) +#define PARAM_k(p) (p->k) +#define PARAM_q(p) (p->q) +#define PARAM_m_bytes(p) (p->m_bytes) +#define PARAM_O_bytes(p) (p->O_bytes) +#define PARAM_v_bytes(p) (p->v_bytes) +#define PARAM_r_bytes(p) (p->r_bytes) +#define PARAM_P1_bytes(p) (p->P1_bytes) +#define PARAM_P2_bytes(p) (p->P2_bytes) +#define PARAM_P3_bytes(p) (p->P3_bytes) +#define PARAM_csk_bytes(p) (p->csk_bytes) +#define PARAM_esk_bytes(p) (p->esk_bytes) +#define PARAM_cpk_bytes(p) (p->cpk_bytes) +#define PARAM_epk_bytes(p) (p->epk_bytes) +#define PARAM_sig_bytes(p) (p->sig_bytes) +#define PARAM_f_tail(p) (p->f_tail) +#define PARAM_salt_bytes(p) (p->salt_bytes) +#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes) +#define PARAM_digest_bytes(p) (p->digest_bytes) +#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes) +#elif defined(MAYO_VARIANT) +#define PARAM_name(p) PARAM_NAME(name) +#define PARAM_m(p) PARAM_NAME(m) +#define PARAM_n(p) PARAM_NAME(n) +#define PARAM_o(p) PARAM_NAME(o) +#define PARAM_v(p) PARAM_NAME(v) +#define PARAM_A_cols(p) PARAM_NAME(A_cols) +#define PARAM_k(p) PARAM_NAME(k) +#define PARAM_q(p) PARAM_NAME(q) +#define PARAM_m_bytes(p) PARAM_NAME(m_bytes) +#define 
PARAM_O_bytes(p) PARAM_NAME(O_bytes) +#define PARAM_v_bytes(p) PARAM_NAME(v_bytes) +#define PARAM_r_bytes(p) PARAM_NAME(r_bytes) +#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes) +#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes) +#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes) +#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes) +#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes) +#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes) +#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes) +#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes) +static const unsigned char f_tail[] = PARAM_NAME(f_tail); +#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes) +#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes) +#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes) +#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes) +#define PARAM_f_tail(p) f_tail +#else +#error "Parameter not specified" +#endif + +/** + * Struct defining MAYO parameters + */ +typedef struct { + int m; + int n; + int o; + int k; + int q; + const unsigned char *f_tail; + int m_bytes; + int O_bytes; + int v_bytes; + int r_bytes; + int R_bytes; + int P1_bytes; + int P2_bytes; + int P3_bytes; + int csk_bytes; + int esk_bytes; + int cpk_bytes; + int epk_bytes; + int sig_bytes; + int salt_bytes; + int sk_seed_bytes; + int digest_bytes; + int pk_seed_bytes; + const char *name; +} mayo_params_t; + +typedef struct sk_t { + uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; + uint8_t o[O_BYTES_MAX]; +} sk_t; + +/** + * MAYO parameter sets + */ +#ifdef ENABLE_PARAMS_DYNAMIC +extern const mayo_params_t MAYO_1; +extern const mayo_params_t MAYO_2; +extern const mayo_params_t MAYO_3; +extern const mayo_params_t MAYO_5; +#endif + +/** + * Status codes + */ +#define MAYO_OK 0 +#define MAYO_ERR 1 + +/** + * Mayo keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. 
+ * + * @param[in] p Mayo parameter set + * @param[out] pk Mayo public key + * @param[out] sk Mayo secret key + * @return int status code + */ +#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) +int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk); + +#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) +int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig, + size_t *siglen, const unsigned char *m, + size_t mlen, const unsigned char *csk); + +/** + * MAYO signature generation. + * + * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec. + * The key provided is a compacted secret key. + * The caller is responsible for allocating sufficient memory to hold sm. + * + * @param[in] p Mayo parameter set + * @param[out] sm Signature concatenated with message + * @param[out] smlen Pointer to the length of sm + * @param[in] m Message to be signed + * @param[in] mlen Message length + * @param[in] sk Compacted secret key + * @return int status code + */ +#define mayo_sign MAYO_NAMESPACE(mayo_sign) +int mayo_sign(const mayo_params_t *p, unsigned char *sm, + size_t *smlen, const unsigned char *m, + size_t mlen, const unsigned char *sk); + +/** + * Mayo open signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m. + * The key provided is a compact public key. + * The caller is responsible for allocating sufficient memory to hold m. 
+ * + * @param[in] p Mayo parameter set + * @param[out] m Message stored if verification succeeds + * @param[out] mlen Pointer to the length of m + * @param[in] sm Signature concatenated with message + * @param[in] smlen Length of sm + * @param[in] pk Compacted public key + * @return int status code + */ +#define mayo_open MAYO_NAMESPACE(mayo_open) +int mayo_open(const mayo_params_t *p, unsigned char *m, + size_t *mlen, const unsigned char *sm, + size_t smlen, const unsigned char *pk); + +/** + * Mayo compact keypair generation. + * + * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold pk and sk. + * + * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and + * cpk are compact representations of a Mayo secret key and public key + * + * @param[in] p Mayo parameter set + * @param[out] cpk Mayo compacted public key + * @param[out] csk Mayo compacted secret key + * @return int status code + */ +#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact) +int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk, + unsigned char *csk); + +/** + * Mayo expand public key. + * + * The implementation corresponds to Mayo.expandPK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold epk. + * + * @param[in] p Mayo parameter set + * @param[in] cpk Compacted public key. + * @param[out] epk Expanded public key. + * @return int return code + */ +#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk) +int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk, + unsigned char *epk); + +/** + * Mayo expand secret key. + * + * The implementation corresponds to Mayo.expandSK() in the Mayo spec. + * The caller is responsible to allocate sufficient memory to hold esk. + * + * @param[in] p Mayo parameter set + * @param[in] csk Compacted secret key. + * @param[out] esk Expanded secret key. 
+ * @return int return code + */ +#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk) +int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, + sk_t *esk); + +/** + * Mayo verify signature. + * + * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1. + * The key provided is a compact public key. + * + * @param[in] p Mayo parameter set + * @param[in] m Message whose signature is verified + * @param[in] mlen Length of m + * @param[in] sig Signature + * @param[in] pk Compacted public key + * @return int 0 if verification succeeded, 1 otherwise. + */ +#define mayo_verify MAYO_NAMESPACE(mayo_verify) +int mayo_verify(const mayo_params_t *p, const unsigned char *m, + size_t mlen, const unsigned char *sig, + const unsigned char *pk); + +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mem.h b/src/sig/mayo/pqmayo_mayo-5_opt/mem.h new file mode 100644 index 0000000000..2aa2196e2e --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/mem.h @@ -0,0 +1,66 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef MEM_H +#define MEM_H +#include +#include + +#if defined(__GNUC__) || defined(__clang__) +#define BSWAP32(i) __builtin_bswap32((i)) +#define BSWAP64(i) __builtin_bswap64((i)) +#else +#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24)) +#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) +#endif + +// a > b -> b - a is negative +// returns 0xFFFFFFFF if true, 0x00000000 if false +static inline uint32_t ct_is_greater_than(int a, int b) { + int32_t diff = b - a; + return (uint32_t) (diff >> (8*sizeof(uint32_t)-1)); +} + +// a > b -> b - a is negative +// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false +static inline uint64_t ct_64_is_greater_than(int a, int b) { + int64_t diff = ((int64_t) b) - ((int64_t) a); + return (uint64_t) (diff >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00000000, else 0xFFFFFFFF 
+static inline uint32_t ct_compare_32(int a, int b) { + return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF +static inline uint64_t ct_compare_64(int a, int b) { + return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)); +} + +// if a == b -> 0x00, else 0xFF +static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) { + return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1)); +} + +#include +/** + * Clears and frees allocated memory. + * + * @param[out] mem Memory to be cleared and freed. + * @param size Size of memory to be cleared and freed. + */ +static inline void mayo_secure_free(void *mem, size_t size) { + OQS_MEM_secure_free(mem, size); +} + +/** + * Clears memory. + * + * @param[out] mem Memory to be cleared. + * @param size Size of memory to be cleared. + */ +static inline void mayo_secure_clear(void *mem, size_t size) { + OQS_MEM_cleanse(mem, size); +} + +#endif diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/params.c b/src/sig/mayo/pqmayo_mayo-5_opt/params.c new file mode 100644 index 0000000000..043d4cedc1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/params.c @@ -0,0 +1,42 @@ +// SPDX-License-Identifier: Apache-2.0 + +#include + +#ifdef ENABLE_PARAMS_DYNAMIC +static const unsigned char f_tail_64[] = F_TAIL_64; +static const unsigned char f_tail_96[] = F_TAIL_96; +static const unsigned char f_tail_128[] = F_TAIL_128; + +#define MAYO_GEN_PARAMS(nm) \ + const mayo_params_t nm = { \ + .m = PARAM_JOIN2(nm, m), \ + .n = PARAM_JOIN2(nm, n), \ + .o = PARAM_JOIN2(nm, o), \ + .k = PARAM_JOIN2(nm, k), \ + .q = PARAM_JOIN2(nm, q), \ + .f_tail = PARAM_JOIN2(nm, f_tail_arr), \ + .m_bytes = PARAM_JOIN2(nm, m_bytes), \ + .O_bytes = PARAM_JOIN2(nm, O_bytes), \ + .v_bytes = PARAM_JOIN2(nm, v_bytes), \ + .r_bytes = PARAM_JOIN2(nm, r_bytes), \ + .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \ + .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \ + .P3_bytes = PARAM_JOIN2(nm, 
P3_bytes), \ + .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \ + .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \ + .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \ + .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \ + .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \ + .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \ + .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \ + .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \ + .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \ + .name = #nm \ + }; + +MAYO_GEN_PARAMS(MAYO_1); +MAYO_GEN_PARAMS(MAYO_2); +MAYO_GEN_PARAMS(MAYO_3); +MAYO_GEN_PARAMS(MAYO_5); +#endif + diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h new file mode 100644 index 0000000000..ce7580f4c1 --- /dev/null +++ b/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h @@ -0,0 +1,147 @@ +// SPDX-License-Identifier: Apache-2.0 + +#ifndef SIMPLE_ARITHMETIC_H +#define SIMPLE_ARITHMETIC_H + +// GF(16) multiplication mod x^4 + x + 1 +static inline unsigned char mul_f(unsigned char a, unsigned char b) { + // carryless multiply + unsigned char p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + unsigned char top_p = p & 0xf0; + unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; + return out; +} + +static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { + // carryless multiply + uint64_t p; + p = (a & 1)*b; + p ^= (a & 2)*b; + p ^= (a & 4)*b; + p ^= (a & 8)*b; + + // reduce mod x^4 + x + 1 + uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0; + uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f; + return out; +} + +// GF(16) addition +static inline unsigned char add_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) subtraction +static inline unsigned char sub_f(unsigned char a, unsigned char b) { + return a ^ b; +} + +// GF(16) negation +static inline unsigned char neg_f(unsigned char a) { + return a; +} + +static inline unsigned char 
inverse_f(unsigned char a) { + // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, + // 10, 4, 3, 8}; return table[a & 15]; + + unsigned char a2 = mul_f(a, a); + unsigned char a4 = mul_f(a2, a2); + unsigned char a8 = mul_f(a4, a4); + unsigned char a6 = mul_f(a2, a4); + unsigned char a14 = mul_f(a8, a6); + + return a14; +} + +static inline unsigned char lincomb(const unsigned char *a, + const unsigned char *b, int n, int m) { + unsigned char ret = 0; + for (int i = 0; i < n; ++i, b += m) { + ret = add_f(mul_f(a[i], *b), ret); + } + return ret; +} + +static inline void mat_mul(const unsigned char *a, const unsigned char *b, + unsigned char *c, int colrow_ab, int row_a, int col_b) { + for (int i = 0; i < row_a; ++i, a += colrow_ab) { + for (int j = 0; j < col_b; ++j, ++c) { + *c = lincomb(a, b + j, colrow_ab, col_b); + } + } +} + +static inline void mat_add(const unsigned char *a, const unsigned char *b, + unsigned char *c, int m, int n) { + for (int i = 0; i < m; ++i) { + for (int j = 0; j < n; ++j) { + *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j)); + } + } +} + +static +inline void m_vec_copy(int m_legs, const uint64_t *in, + uint64_t *out) { + for (int i = 0; i < m_legs * 2; i++) { + out[i] = in[i]; + } +} + +static +inline void m_vec_add(int m_legs, const uint64_t *in, + uint64_t *acc) { + for (int i = 0; i < m_legs * 2; i++) { + acc[i] ^= in[i]; + } +} + +// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1) +// gf16 := gf2[x]/(x^4+x+1) + +static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { + uint32_t a_msb; + uint32_t a32 = a; + uint32_t b32 = b; + uint32_t r32 = a32 * (b32 & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 1) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 
2) & 1); + + a_msb = a32 & 0x88888888; // MSB, 3rd bits + a32 ^= a_msb; // clear MSB + a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); + r32 ^= (a32) * ((b32 >> 3) & 1); + + return r32; + +} + +static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { + for(int i=0; i < m_legs*2;i++){ + acc[i] ^= gf16v_mul_u64(in[i], a); + } +} + +static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { + for(int i=0;i + +#if defined(OQS_ENABLE_SIG_mayo_1) +#define OQS_SIG_mayo_1_length_public_key 1168 +#define OQS_SIG_mayo_1_length_secret_key 24 +#define OQS_SIG_mayo_1_length_signature 321 + +OQS_SIG *OQS_SIG_mayo_1_new(void); +OQS_API OQS_STATUS OQS_SIG_mayo_1_keypair(uint8_t *public_key, uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_1_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_1_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key); +#endif + +#if defined(OQS_ENABLE_SIG_mayo_2) +#define OQS_SIG_mayo_2_length_public_key 5488 +#define OQS_SIG_mayo_2_length_secret_key 24 +#define OQS_SIG_mayo_2_length_signature 180 + +OQS_SIG *OQS_SIG_mayo_2_new(void); +OQS_API OQS_STATUS OQS_SIG_mayo_2_keypair(uint8_t *public_key, uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_2_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_2_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key); +#endif + +#if defined(OQS_ENABLE_SIG_mayo_3) +#define OQS_SIG_mayo_3_length_public_key 2656 +#define OQS_SIG_mayo_3_length_secret_key 32 +#define OQS_SIG_mayo_3_length_signature 577 + +OQS_SIG *OQS_SIG_mayo_3_new(void); +OQS_API OQS_STATUS OQS_SIG_mayo_3_keypair(uint8_t *public_key, uint8_t 
*secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_3_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_3_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key); +#endif + +#if defined(OQS_ENABLE_SIG_mayo_5) +#define OQS_SIG_mayo_5_length_public_key 5008 +#define OQS_SIG_mayo_5_length_secret_key 40 +#define OQS_SIG_mayo_5_length_signature 838 + +OQS_SIG *OQS_SIG_mayo_5_new(void); +OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key); +OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key); +#endif + +#endif diff --git a/src/sig/mayo/sig_mayo_1.c b/src/sig/mayo/sig_mayo_1.c new file mode 100644 index 0000000000..93034eb5ef --- /dev/null +++ b/src/sig/mayo/sig_mayo_1.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT + +#include + +#include + +#if defined(OQS_ENABLE_SIG_mayo_1) + +OQS_SIG *OQS_SIG_mayo_1_new(void) { + + OQS_SIG *sig = malloc(sizeof(OQS_SIG)); + if (sig == NULL) { + return NULL; + } + sig->method_name = OQS_SIG_alg_mayo_1; + sig->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo"; + + sig->claimed_nist_level = 1; + sig->euf_cma = true; + + sig->length_public_key = OQS_SIG_mayo_1_length_public_key; + sig->length_secret_key = OQS_SIG_mayo_1_length_secret_key; + sig->length_signature = OQS_SIG_mayo_1_length_signature; + + sig->keypair = OQS_SIG_mayo_1_keypair; + sig->sign = OQS_SIG_mayo_1_sign; + sig->verify = OQS_SIG_mayo_1_verify; + + return sig; +} + +extern int pqmayo_MAYO_1_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int 
pqmayo_MAYO_1_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_1_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); + +#if defined(OQS_ENABLE_SIG_mayo_1_avx2) +extern int pqmayo_MAYO_1_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_1_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_1_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +#endif + +OQS_API OQS_STATUS OQS_SIG_mayo_1_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_1_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_1_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_1_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); 
+#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_1_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) { +#if defined(OQS_ENABLE_SIG_mayo_1_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#endif +} + +#endif diff --git a/src/sig/mayo/sig_mayo_2.c b/src/sig/mayo/sig_mayo_2.c new file mode 100644 index 0000000000..25719f0aee --- /dev/null +++ b/src/sig/mayo/sig_mayo_2.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT + +#include + +#include + +#if defined(OQS_ENABLE_SIG_mayo_2) + +OQS_SIG *OQS_SIG_mayo_2_new(void) { + + OQS_SIG *sig = malloc(sizeof(OQS_SIG)); + if (sig == NULL) { + return NULL; + } + sig->method_name = OQS_SIG_alg_mayo_2; + sig->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo"; + + sig->claimed_nist_level = 1; + sig->euf_cma = true; + + sig->length_public_key = OQS_SIG_mayo_2_length_public_key; + sig->length_secret_key = OQS_SIG_mayo_2_length_secret_key; + sig->length_signature = OQS_SIG_mayo_2_length_signature; + + sig->keypair = OQS_SIG_mayo_2_keypair; + sig->sign = OQS_SIG_mayo_2_sign; + sig->verify = OQS_SIG_mayo_2_verify; + + return sig; +} + +extern int pqmayo_MAYO_2_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_2_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_2_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t 
mlen, const uint8_t *pk); + +#if defined(OQS_ENABLE_SIG_mayo_2_avx2) +extern int pqmayo_MAYO_2_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_2_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_2_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +#endif + +OQS_API OQS_STATUS OQS_SIG_mayo_2_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_2_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_2_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_2_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_2_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) { +#if defined(OQS_ENABLE_SIG_mayo_2_avx2) +#if 
defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#endif +} + +#endif diff --git a/src/sig/mayo/sig_mayo_3.c b/src/sig/mayo/sig_mayo_3.c new file mode 100644 index 0000000000..7a68024880 --- /dev/null +++ b/src/sig/mayo/sig_mayo_3.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT + +#include + +#include + +#if defined(OQS_ENABLE_SIG_mayo_3) + +OQS_SIG *OQS_SIG_mayo_3_new(void) { + + OQS_SIG *sig = malloc(sizeof(OQS_SIG)); + if (sig == NULL) { + return NULL; + } + sig->method_name = OQS_SIG_alg_mayo_3; + sig->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo"; + + sig->claimed_nist_level = 3; + sig->euf_cma = true; + + sig->length_public_key = OQS_SIG_mayo_3_length_public_key; + sig->length_secret_key = OQS_SIG_mayo_3_length_secret_key; + sig->length_signature = OQS_SIG_mayo_3_length_signature; + + sig->keypair = OQS_SIG_mayo_3_keypair; + sig->sign = OQS_SIG_mayo_3_sign; + sig->verify = OQS_SIG_mayo_3_verify; + + return sig; +} + +extern int pqmayo_MAYO_3_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_3_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_3_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); + +#if defined(OQS_ENABLE_SIG_mayo_3_avx2) +extern int pqmayo_MAYO_3_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_3_avx2_crypto_sign_signature(uint8_t *sig, size_t 
*siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_3_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +#endif + +OQS_API OQS_STATUS OQS_SIG_mayo_3_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_3_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_3_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_3_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_3_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) { +#if defined(OQS_ENABLE_SIG_mayo_3_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#if 
defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#endif +} + +#endif diff --git a/src/sig/mayo/sig_mayo_5.c b/src/sig/mayo/sig_mayo_5.c new file mode 100644 index 0000000000..ca19cfbf91 --- /dev/null +++ b/src/sig/mayo/sig_mayo_5.c @@ -0,0 +1,90 @@ +// SPDX-License-Identifier: MIT + +#include + +#include + +#if defined(OQS_ENABLE_SIG_mayo_5) + +OQS_SIG *OQS_SIG_mayo_5_new(void) { + + OQS_SIG *sig = malloc(sizeof(OQS_SIG)); + if (sig == NULL) { + return NULL; + } + sig->method_name = OQS_SIG_alg_mayo_5; + sig->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo"; + + sig->claimed_nist_level = 5; + sig->euf_cma = true; + + sig->length_public_key = OQS_SIG_mayo_5_length_public_key; + sig->length_secret_key = OQS_SIG_mayo_5_length_secret_key; + sig->length_signature = OQS_SIG_mayo_5_length_signature; + + sig->keypair = OQS_SIG_mayo_5_keypair; + sig->sign = OQS_SIG_mayo_5_sign; + sig->verify = OQS_SIG_mayo_5_verify; + + return sig; +} + +extern int pqmayo_MAYO_5_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_5_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_5_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); + +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +extern int pqmayo_MAYO_5_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk); +extern int pqmayo_MAYO_5_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk); +extern int pqmayo_MAYO_5_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk); +#endif + +OQS_API OQS_STATUS 
OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_keypair(public_key, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) { +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key); +#endif +} + +OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) { +#if defined(OQS_ENABLE_SIG_mayo_5_avx2) +#if defined(OQS_DIST_BUILD) + if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) { +#endif /* OQS_DIST_BUILD */ + return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#if defined(OQS_DIST_BUILD) + } else { + return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); + } +#endif /* OQS_DIST_BUILD */ +#else + return (OQS_STATUS) 
pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key); +#endif +} + +#endif diff --git a/src/sig/sig.c b/src/sig/sig.c index ae41478387..bab752c607 100644 --- a/src/sig/sig.c +++ b/src/sig/sig.c @@ -39,7 +39,11 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) { OQS_SIG_alg_sphincs_shake_192f_simple, OQS_SIG_alg_sphincs_shake_192s_simple, OQS_SIG_alg_sphincs_shake_256f_simple, - OQS_SIG_alg_sphincs_shake_256s_simple,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END + OQS_SIG_alg_sphincs_shake_256s_simple, + OQS_SIG_alg_mayo_1, + OQS_SIG_alg_mayo_2, + OQS_SIG_alg_mayo_3, + OQS_SIG_alg_mayo_5,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END }; if (i >= OQS_SIG_algs_length) { return NULL; @@ -232,6 +236,34 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) { #else return 0; #endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_1)) { +#ifdef OQS_ENABLE_SIG_mayo_1 + return 1; +#else + return 0; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_2)) { +#ifdef OQS_ENABLE_SIG_mayo_2 + return 1; +#else + return 0; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_3)) { +#ifdef OQS_ENABLE_SIG_mayo_3 + return 1; +#else + return 0; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) { +#ifdef OQS_ENABLE_SIG_mayo_5 + return 1; +#else + return 0; +#endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ENABLED_CASE_END } else { return 0; @@ -418,6 +450,34 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) { #else return NULL; #endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_1)) { +#ifdef OQS_ENABLE_SIG_mayo_1 + return OQS_SIG_mayo_1_new(); +#else + return NULL; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_2)) { +#ifdef OQS_ENABLE_SIG_mayo_2 + return OQS_SIG_mayo_2_new(); +#else + return NULL; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_3)) { +#ifdef OQS_ENABLE_SIG_mayo_3 
+ return OQS_SIG_mayo_3_new(); +#else + return NULL; +#endif + + } else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) { +#ifdef OQS_ENABLE_SIG_mayo_5 + return OQS_SIG_mayo_5_new(); +#else + return NULL; +#endif ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_NEW_CASE_END // EDIT-WHEN-ADDING-SIG } else { diff --git a/src/sig/sig.h b/src/sig/sig.h index 6e3c3951c5..bb2a738215 100644 --- a/src/sig/sig.h +++ b/src/sig/sig.h @@ -82,12 +82,20 @@ extern "C" { #define OQS_SIG_alg_sphincs_shake_256f_simple "SPHINCS+-SHAKE-256f-simple" /** Algorithm identifier for SPHINCS+-SHAKE-256s-simple */ #define OQS_SIG_alg_sphincs_shake_256s_simple "SPHINCS+-SHAKE-256s-simple" +/** Algorithm identifier for MAYO-1 */ +#define OQS_SIG_alg_mayo_1 "MAYO-1" +/** Algorithm identifier for MAYO-2 */ +#define OQS_SIG_alg_mayo_2 "MAYO-2" +/** Algorithm identifier for MAYO-3 */ +#define OQS_SIG_alg_mayo_3 "MAYO-3" +/** Algorithm identifier for MAYO-5 */ +#define OQS_SIG_alg_mayo_5 "MAYO-5" ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END // EDIT-WHEN-ADDING-SIG ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START /** Number of algorithm identifiers above. 
*/ -#define OQS_SIG_algs_length 25 +#define OQS_SIG_algs_length 29 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END /** @@ -266,6 +274,9 @@ OQS_API void OQS_SIG_free(OQS_SIG *sig); #ifdef OQS_ENABLE_SIG_SPHINCS #include #endif /* OQS_ENABLE_SIG_SPHINCS */ +#ifdef OQS_ENABLE_SIG_MAYO +#include +#endif /* OQS_ENABLE_SIG_MAYO */ ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_END // EDIT-WHEN-ADDING-SIG diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json index e60fe897ba..918b893659 100644 --- a/tests/KATs/sig/kats.json +++ b/tests/KATs/sig/kats.json @@ -27,6 +27,22 @@ "all": "362ecc0537ca1fe25143fb7ccb04de8ee7703469d13ebcf311ab124a5c374a65", "single": "91842d41138e7cfaf6e2e8f12a03c3b3411302255121e4d07d02f91a003c0395" }, + "MAYO-1": { + "all": "5cf156cf74fc65b43863399ecc4b26ad7b4b3b2cd8485215dc0c011e2825b145", + "single": "ba2473dedd92cf3b8a1fc14fc22f2ffdde972c8b64cfcd8cddb4f803e48df017" + }, + "MAYO-2": { + "all": "c0daf74b54fae78685c87b32d3b36a418bac884c3564ea96d56c6601b138d449", + "single": "72cb237642b2c0c4e7f8c824d9c8601ac7189784649d28dbb2cccfb94732c9a3" + }, + "MAYO-3": { + "all": "f66b95dda153b7df00610aa018f0644146e7e564b33562c51bb088c40fb0dcb2", + "single": "dbc49f4fdfa0de69d416051215cb53c042c4a329d325452d079f3734b7467a6b" + }, + "MAYO-5": { + "all": "7b230c2626f57159a243d8dfc69c62cb94dd0f179dd2b4f2ef3606deb6404477", + "single": "f2c1c69045c7d15e714a04119965e8a7007ef54f9293158587560227c97b237d" + }, "ML-DSA-44": { "all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9", "single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1" diff --git a/tests/constant_time/sig/issues.json b/tests/constant_time/sig/issues.json index 7eb295ffa0..b5ea3f5a1d 100644 --- a/tests/constant_time/sig/issues.json +++ b/tests/constant_time/sig/issues.json @@ -7,6 +7,9 @@ "Falcon-512": ["falcon"], "Falcon-padded-1024": ["falcon"], "Falcon-padded-512": ["falcon"], + "MAYO-1": [], + "MAYO-2": [], + "MAYO-3": [], "ML-DSA-44-ipd": [], 
"ML-DSA-65-ipd": [], "ML-DSA-87-ipd": [], diff --git a/tests/constant_time/sig/passes.json b/tests/constant_time/sig/passes.json index a6096eb640..4803e636b9 100644 --- a/tests/constant_time/sig/passes.json +++ b/tests/constant_time/sig/passes.json @@ -7,6 +7,9 @@ "Falcon-512": ["falcon_keygen", "falcon_sign"], "Falcon-padded-1024": ["falcon_keygen", "falcon_sign"], "Falcon-padded-512": ["falcon_keygen", "falcon_sign"], + "MAYO-1": ["mayo"], + "MAYO-2": ["mayo"], + "MAYO-3": ["mayo"], "ML-DSA-44-ipd": ["ml_dsa", "ml_dsa-avx2"], "ML-DSA-65-ipd": ["ml_dsa", "ml_dsa-avx2"], "ML-DSA-87-ipd": ["ml_dsa", "ml_dsa-avx2"], diff --git a/tests/constant_time/sig/passes/mayo b/tests/constant_time/sig/passes/mayo new file mode 100644 index 0000000000..9a97a98ae3 --- /dev/null +++ b/tests/constant_time/sig/passes/mayo @@ -0,0 +1,5 @@ +{ + Restart in case no solution x to Ax = y, with r used as randomness was found + Memcheck:Cond + src:arithmetic.c:282 # fun:pqmayo_MAYO_*sample_solution +} \ No newline at end of file diff --git a/tests/kat_sig.c b/tests/kat_sig.c index 21c208f3a5..ffb0456920 100644 --- a/tests/kat_sig.c +++ b/tests/kat_sig.c @@ -272,6 +272,46 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le memcpy(*signed_msg, signature, signature_len); memcpy(*signed_msg + signature_len, msg, msg_len); return OQS_SUCCESS; + } else if (0 == strcmp(sig->method_name, "MAYO-1")) { + // signed_msg = signature || msg + *signed_msg_len = signature_len + msg_len; + *signed_msg = malloc(*signed_msg_len); + if (*signed_msg == NULL) { + return OQS_ERROR; + } + memcpy(*signed_msg, signature, signature_len); + memcpy(*signed_msg + signature_len, msg, msg_len); + return OQS_SUCCESS; + } else if (0 == strcmp(sig->method_name, "MAYO-2")) { + // signed_msg = signature || msg + *signed_msg_len = signature_len + msg_len; + *signed_msg = malloc(*signed_msg_len); + if (*signed_msg == NULL) { + return OQS_ERROR; + } + memcpy(*signed_msg, signature, signature_len); + 
memcpy(*signed_msg + signature_len, msg, msg_len); + return OQS_SUCCESS; + } else if (0 == strcmp(sig->method_name, "MAYO-3")) { + // signed_msg = signature || msg + *signed_msg_len = signature_len + msg_len; + *signed_msg = malloc(*signed_msg_len); + if (*signed_msg == NULL) { + return OQS_ERROR; + } + memcpy(*signed_msg, signature, signature_len); + memcpy(*signed_msg + signature_len, msg, msg_len); + return OQS_SUCCESS; + } else if (0 == strcmp(sig->method_name, "MAYO-5")) { + // signed_msg = signature || msg + *signed_msg_len = signature_len + msg_len; + *signed_msg = malloc(*signed_msg_len); + if (*signed_msg == NULL) { + return OQS_ERROR; + } + memcpy(*signed_msg, signature, signature_len); + memcpy(*signed_msg + signature_len, msg, msg_len); + return OQS_SUCCESS; ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_COMBINE_MESSAGE_SIGNATURE_END } else { return OQS_ERROR; diff --git a/tests/test_aes.c b/tests/test_aes.c index 4ba265c690..bf1f5aed39 100644 --- a/tests/test_aes.c +++ b/tests/test_aes.c @@ -52,6 +52,30 @@ static int test_aes128_correctness(void) { return EXIT_SUCCESS; } +// test vector #3 from https://tools.ietf.org/html/rfc3686#section-6 +static const uint8_t test_aes128ctr_key[] = {0x76, 0x91, 0xBE, 0x03, 0x5E, 0x50, 0x20, 0xA8, 0xAC, 0x6E, 0x61, 0x85, 0x29, 0xF9, 0xA0, 0xDC}; +static const uint8_t test_aes128ctr_iv[] = {0x00, 0xE0, 0x01, 0x7B, 0x27, 0x77, 0x7F, 0x3F, 0x4A, 0x17, 0x86, 0xF0, 0x00, 0x00, 0x00, 0x01}; +static const uint8_t test_aes128ctr_plaintext[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23}; +static const uint8_t test_aes128ctr_ciphertext[] = {0xC1, 0xCF, 0x48, 0xA8, 0x9F, 0x2F, 0xFD, 0xD9, 0xCF, 0x46, 0x52, 0xE9, 0xEF, 0xDB, 0x72, 0xD7, 0x45, 0x40, 0xA4, 0x2B, 0xDE, 0x6D, 0x78, 0x36, 0xD5, 0x9A, 0x5C, 0xEA, 0xAE, 0xF3, 0x10, 0x53, 0x25, 0xB2, 0x07, 0x2F}; + +static int 
test_aes128ctr_correctness(void) { + uint8_t derived_ciphertext[36]; /* key stream first, then XORed in place into the ciphertext */ + void *schedule = NULL; + OQS_AES128_CTR_inc_init(test_aes128ctr_key, &schedule); + OQS_AES128_CTR_inc_stream_iv(test_aes128ctr_iv, sizeof(test_aes128ctr_iv), schedule, derived_ciphertext, sizeof(derived_ciphertext)); + OQS_AES128_free_schedule(schedule); /* schedule is not used below; free here so the failure return cannot leak it */ + for (size_t i = 0; i < sizeof(derived_ciphertext); i++) { + derived_ciphertext[i] ^= test_aes128ctr_plaintext[i]; + } + if (memcmp(test_aes128ctr_ciphertext, derived_ciphertext, 36) != 0) { + printf("test_aes128ctr_correctness ciphertext does not match\n"); + OQS_print_hex_string("expected ciphertext", test_aes128ctr_ciphertext, 36); + OQS_print_hex_string("derived ciphertext", derived_ciphertext, 36); + return EXIT_FAILURE; + } + return EXIT_SUCCESS; +} + static int test_aes256_correctness(void) { uint8_t derived_ciphertext[16]; void *schedule = NULL; @@ -159,6 +183,10 @@ int main(int argc, char **argv) { OQS_destroy(); return EXIT_FAILURE; } + if (test_aes128ctr_correctness() != EXIT_SUCCESS) { + OQS_destroy(); + return EXIT_FAILURE; + } if (test_aes256_correctness() != EXIT_SUCCESS) { OQS_destroy(); diff --git a/tests/test_binary.py b/tests/test_binary.py index d212f416d3..1c33093ae7 100644 --- a/tests/test_binary.py +++ b/tests/test_binary.py @@ -33,7 +33,7 @@ def test_namespace(): symbols.append(line) # ideally this would be just ['oqs', 'pqclean'], but contains exceptions (e.g., providing compat implementations of unavailable platform functions) - namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk'] + namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'pqmayo', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk'] non_namespaced = [] for symbolstr in symbols: diff --git a/tests/test_sig.c b/tests/test_sig.c index 90990adad2..185ef169c9 100644 --- a/tests/test_sig.c +++ b/tests/test_sig.c @@ -224,17 +224,30 @@ int main(int argc, char **argv) { OQS_STATUS rc; #if 
OQS_USE_PTHREADS #define MAX_LEN_SIG_NAME_ 64 - pthread_t thread; - struct thread_data td; - td.alg_name = alg_name; - int trc = pthread_create(&thread, NULL, test_wrapper, &td); - if (trc) { - fprintf(stderr, "ERROR: Creating pthread\n"); - OQS_destroy(); - return EXIT_FAILURE; + // don't run MAYO_5 in threads because of large stack usage + char no_thread_sig_patterns[][MAX_LEN_SIG_NAME_] = {"MAYO-5"}; + int test_in_thread = 1; + for (size_t i = 0 ; i < sizeof(no_thread_sig_patterns) / MAX_LEN_SIG_NAME_; ++i) { + if (strstr(alg_name, no_thread_sig_patterns[i]) != NULL) { + test_in_thread = 0; + break; + } + } + if (test_in_thread) { + pthread_t thread; + struct thread_data td; + td.alg_name = alg_name; + int trc = pthread_create(&thread, NULL, test_wrapper, &td); + if (trc) { + fprintf(stderr, "ERROR: Creating pthread\n"); + OQS_destroy(); + return EXIT_FAILURE; + } + pthread_join(thread, NULL); + rc = td.rc; + } else { + rc = sig_test_correctness(alg_name); } - pthread_join(thread, NULL); - rc = td.rc; #else rc = sig_test_correctness(alg_name); #endif diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt index ff772d895e..29be2f9d55 100644 --- a/zephyr/CMakeLists.txt +++ b/zephyr/CMakeLists.txt @@ -121,6 +121,13 @@ if(CONFIG_LIBOQS) set(OQS_ENABLE_SIG_SPHINCS OFF) endif() + if(CONFIG_LIBOQS_ENABLE_SIG_MAYO) + set(OQS_ENABLE_SIG_MAYO ON) + set(OQS_ENABLE_SIG_mayo_5 OFF) + else() + set(OQS_ENABLE_SIG_MAYO OFF) + endif() + # Add the actual liboqs targets add_subdirectory(.. 
build) diff --git a/zephyr/Kconfig b/zephyr/Kconfig index e7ceb8eac8..9f34817012 100644 --- a/zephyr/Kconfig +++ b/zephyr/Kconfig @@ -66,4 +66,9 @@ config LIBOQS_ENABLE_SIG_SPHINCS default y depends on LIBOQS +config LIBOQS_ENABLE_SIG_MAYO + bool "Enable the MAYO signature algorithm" + default y + depends on LIBOQS + endmenu diff --git a/zephyr/samples/Signatures/prj.conf b/zephyr/samples/Signatures/prj.conf index 821b1889d3..1e0ef6749c 100644 --- a/zephyr/samples/Signatures/prj.conf +++ b/zephyr/samples/Signatures/prj.conf @@ -8,12 +8,12 @@ CONFIG_LIBOQS_ENABLE_SIG_DILITHIUM=y CONFIG_PICOLIBC=y CONFIG_TEST_RANDOM_GENERATOR=y -# Set the stack size to 128K -CONFIG_MAIN_STACK_SIZE=131072 +# Set the stack size to 512K +CONFIG_MAIN_STACK_SIZE=524288 -# Enable malloc and set the available size to 128K +# Enable malloc and set the available size to 256K CONFIG_COMMON_LIBC_MALLOC=y -CONFIG_COMMON_LIBC_MALLOC_ARENA_SIZE=131072 +CONFIG_COMMON_LIBC_MALLOC_ARENA_SIZE=262144 CONFIG_SPEED_OPTIMIZATIONS=y CONFIG_LOG=y diff --git a/zephyr/samples/Signatures/sample.yaml b/zephyr/samples/Signatures/sample.yaml index 1f9c30cc8f..601f7d4ab2 100644 --- a/zephyr/samples/Signatures/sample.yaml +++ b/zephyr/samples/Signatures/sample.yaml @@ -10,7 +10,7 @@ common: tests: sample.crypto.liboqs_signature_example: - timeout: 900 + timeout: 1500 integration_platforms: - qemu_x86 - qemu_cortex_a53 diff --git a/zephyr/samples/Signatures/src/main.c b/zephyr/samples/Signatures/src/main.c index 6e8b596596..9b2bc54822 100644 --- a/zephyr/samples/Signatures/src/main.c +++ b/zephyr/samples/Signatures/src/main.c @@ -171,15 +171,14 @@ int main(void) const char *alg_name = OQS_SIG_alg_identifier(i); if (!OQS_SIG_alg_is_enabled(alg_name)) { printf("Signature algorithm %s not enabled!\n", alg_name); - OQS_destroy(); - return EXIT_FAILURE; } + else { + rc = sig_test_correctness(alg_name); - rc = sig_test_correctness(alg_name); - - if (rc != OQS_SUCCESS) { - OQS_destroy(); - return EXIT_FAILURE; + if (rc != 
OQS_SUCCESS) { + OQS_destroy(); + return EXIT_FAILURE; + } } }