diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 73e972dd70..7309c800f3 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -166,6 +166,12 @@ cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192f_simple "" ON "OQS_ENABL
cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256f_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
+
+option(OQS_ENABLE_SIG_MAYO "Enable mayo algorithm family" ON)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_1 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_2 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_3 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_5 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_END
if((OQS_MINIMAL_BUILD STREQUAL "ON"))
@@ -184,6 +190,8 @@ elseif (${OQS_ALGS_ENABLED} STREQUAL "STD")
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_END
elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_R4")
filter_algs("KEM_classic_mceliece_348864;KEM_classic_mceliece_348864f;KEM_classic_mceliece_460896;KEM_classic_mceliece_460896f;KEM_classic_mceliece_6688128;KEM_classic_mceliece_6688128f;KEM_classic_mceliece_6960119;KEM_classic_mceliece_6960119f;KEM_classic_mceliece_8192128;KEM_classic_mceliece_8192128f;KEM_hqc_128;KEM_hqc_192;KEM_hqc_256;KEM_bike_l1;KEM_bike_l3;KEM_bike_l5")
+elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_SIG_ONRAMP")
+ filter_algs("SIG_mayo_1;SIG_mayo_2;SIG_mayo_3;SIG_mayo_5")
else()
message(STATUS "Alg enablement unchanged")
endif()
@@ -495,6 +503,31 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
endif()
endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_mayo_1_avx2 "" ON "OQS_ENABLE_SIG_mayo_1" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_mayo_2_avx2 "" ON "OQS_ENABLE_SIG_mayo_2" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_mayo_3_avx2 "" ON "OQS_ENABLE_SIG_mayo_3" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+ cmake_dependent_option(OQS_ENABLE_SIG_mayo_5_avx2 "" ON "OQS_ENABLE_SIG_mayo_5" OFF)
+endif()
+endif()
+
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_CONDITIONAL_END
option(OQS_ENABLE_SIG_STFL_XMSS "Enable XMSS algorithm family" OFF)
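
Each of these CMake switches is mirrored at build time as a preprocessor macro in the generated oqsconfig.h, following the existing OQS_ENABLE_SIG_<family>/OQS_ENABLE_SIG_<alg> pattern, so downstream code can guard MAYO-specific paths at compile time. A minimal sketch, assuming the generated macro and the algorithm identifier follow that pattern:

#include <oqs/oqs.h>
#include <stdio.h>

int main(void) {
#if defined(OQS_ENABLE_SIG_mayo_2)
    /* OQS_SIG_alg_mayo_2 is assumed to be added by sig_mayo.h alongside the option above. */
    printf("MAYO-2 enabled at runtime: %d\n", OQS_SIG_alg_is_enabled(OQS_SIG_alg_mayo_2));
#else
    printf("MAYO-2 was disabled at build time\n");
#endif
    return 0;
}
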
diff --git a/.github/workflows/release-test.yml b/.github/workflows/release-test.yml
index 47957f4d20..2a4addd541 100644
--- a/.github/workflows/release-test.yml
+++ b/.github/workflows/release-test.yml
@@ -17,7 +17,7 @@ on:
jobs:
oqs-provider-release-test:
- if: github.event_name == 'release' || endsWith( github.event.head_commit.message, '[trigger downstream]' )
+ if: github.event_name == 'release' || contains( github.event.head_commit.message, '[trigger downstream]' )
runs-on: ubuntu-latest
steps:
- name: Checkout release tests script
diff --git a/.github/workflows/unix.yml b/.github/workflows/unix.yml
index 5882d9bc8f..49d520eaee 100644
--- a/.github/workflows/unix.yml
+++ b/.github/workflows/unix.yml
@@ -112,6 +112,11 @@ jobs:
container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON -DOQS_ALGS_ENABLED=NIST_R4
PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py
+ - name: focal-nistonramp-openssl
+ runner: ubuntu-latest
+ container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
+ CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON -DOQS_ALGS_ENABLED=NIST_SIG_ONRAMP
+ PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py
- name: jammy-std-openssl3
runner: ubuntu-latest
container: openquantumsafe/ci-ubuntu-jammy:latest
@@ -256,7 +261,7 @@ jobs:
- name: Install dependencies
run: env HOMEBREW_NO_AUTO_UPDATE=1 brew install ninja && pip3 install --require-hashes --break-system-packages -r .github/workflows/requirements.txt
- name: Patch GCC
- run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies gcc@13.rb
+ run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies --formula gcc@13.rb
- name: Get system information
run: sysctl -a | grep machdep.cpu
- name: Configure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0524a07c5b..ebbd58962f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -200,6 +200,9 @@ endif()
if(OQS_ENABLE_SIG_SPHINCS)
set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/sphincs/sig_sphincs.h)
endif()
+if(OQS_ENABLE_SIG_MAYO)
+ set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/mayo/sig_mayo.h)
+endif()
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_HEADERS_END
if(OQS_ENABLE_SIG_STFL_XMSS)
set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig_stfl/xmss/sig_stfl_xmss.h)
diff --git a/CONFIGURE.md b/CONFIGURE.md
index 9bae9f5af2..d22c9fa34f 100644
--- a/CONFIGURE.md
+++ b/CONFIGURE.md
@@ -58,9 +58,9 @@ For a full list of such options and their default values, consult [.CMake/alg_su
## OQS_ALGS_ENABLED
-A selected algorithm set is enabled. Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl).
+A selected algorithm set is enabled. Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "NIST_SIG_ONRAMP" selecting algorithms evaluated in the NIST PQC "onramp" standardization for additional signature schemes; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl).
-**Attention**: If you use any predefined value (`STD` or `NIST_R4` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored.
+**Attention**: If you use any predefined value (`STD`, `NIST_R4`, or `NIST_SIG_ONRAMP` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored.
**Default**: `All`.
diff --git a/PLATFORMS.md b/PLATFORMS.md
index 60f695d886..e2220229ae 100644
--- a/PLATFORMS.md
+++ b/PLATFORMS.md
@@ -62,4 +62,3 @@ In this policy, the words "must" and "must not" specify absolute requirements th
- x86 for Windows (Visual Studio Toolchain)
- ppc64le for Ubuntu (Focal)
- s390x for Ubuntu (Focal)
-
diff --git a/README.md b/README.md
index 2b8122b4d7..b21281e2cf 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. `liboqs` makes
- **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5
- **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024
+- **MAYO**: MAYO-1, MAYO-2, MAYO-3, MAYO-5†
- **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87)
- **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple
- **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple
@@ -197,6 +198,7 @@ liboqs includes some third party libraries or modules that are licensed differen
- `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
- src/sig/falcon/pqclean_\*\_aarch64 : Apache License v2.0
+- `src/sig/mayo/*`: Apache License v2.0
- `src/sig/ml_dsa/pqcrystals-*`: public domain (CC0) or Apache License v2.0
- `src/sig/sphincs/pqclean_*`: CC0 (public domain)
diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md
new file mode 100644
index 0000000000..3174058f13
--- /dev/null
+++ b/docs/algorithms/sig/mayo.md
@@ -0,0 +1,62 @@
+# MAYO
+
+- **Algorithm type**: Digital signature scheme.
+- **Main cryptographic assumption**: multivariable quadratic equations, oil and vinegar.
+- **Principal submitters**: Ward Beullens, Fabio Campos, Sofía Celi, Basil Hess, Matthias J. Kannwischer.
+- **Authors' website**: https://pqmayo.org
+- **Specification version**: https://doi.org/10.46586/tches.v2024.i2.252-275.
+- **Primary Source**:
+ - **Source**: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e with copy_from_upstream patches
+ - **Implementation license (SPDX-Identifier)**: Apache-2.0
+
+
+## Parameter set summary
+
+| Parameter set | Parameter set alias | Security model | Claimed NIST Level | Public key size (bytes) | Secret key size (bytes) | Signature size (bytes) |
+|:---------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
+| MAYO-1 | NA | EUF-CMA | 1 | 1168 | 24 | 321 |
+| MAYO-2 | NA | EUF-CMA | 1 | 5488 | 24 | 180 |
+| MAYO-3 | NA | EUF-CMA | 3 | 2656 | 32 | 577 |
+| MAYO-5 | NA | EUF-CMA | 5 | 5008 | 40 | 838 |
+
+## MAYO-1 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage?‡ |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
+| [Primary Source](#primary-source) | opt | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+ ‡For an explanation of what this denotes, consult the [Explanation of Terms](#explanation-of-terms) section at the end of this file.
+
+## MAYO-2 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## MAYO-3 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | False |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## MAYO-5 implementation characteristics
+
+| Implementation source | Identifier in upstream | Supported architecture(s) | Supported operating system(s) | CPU extension(s) used | No branching-on-secrets claimed? | No branching-on-secrets checked by valgrind? | Large stack usage? |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt | All | All | None | True | True | False |
+| [Primary Source](#primary-source) | avx2 | x86\_64 | Darwin,Linux | AVX2 | True | True | True |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Explanation of Terms
+
+- **Large Stack Usage**: Implementations identified as having such may cause failures when running in threads or in constrained environments.
\ No newline at end of file
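
Once enabled, the parameter sets documented above are reachable through the generic OQS_SIG API like any other liboqs signature scheme. A minimal sign/verify sketch for MAYO-2; the identifier OQS_SIG_alg_mayo_2 is assumed to follow liboqs's usual OQS_SIG_alg_<scheme> naming for the schemes added here:

#include <oqs/oqs.h>
#include <stdio.h>
#include <stdlib.h>

int main(void) {
    OQS_SIG *sig = OQS_SIG_new(OQS_SIG_alg_mayo_2); /* assumed identifier, equivalent to "MAYO-2" */
    if (sig == NULL) {
        return EXIT_FAILURE; /* scheme not enabled in this build */
    }
    uint8_t *pk = malloc(sig->length_public_key);
    uint8_t *sk = malloc(sig->length_secret_key);
    uint8_t *sm = malloc(sig->length_signature);
    size_t sm_len = 0;
    const uint8_t msg[] = "example message";
    OQS_STATUS ok = OQS_ERROR;

    if (pk && sk && sm &&
        OQS_SIG_keypair(sig, pk, sk) == OQS_SUCCESS &&
        OQS_SIG_sign(sig, sm, &sm_len, msg, sizeof(msg), sk) == OQS_SUCCESS) {
        ok = OQS_SIG_verify(sig, msg, sizeof(msg), sm, sm_len, pk);
    }
    printf("MAYO-2 sign/verify: %s\n", ok == OQS_SUCCESS ? "ok" : "failed");

    OQS_MEM_secure_free(sk, sig->length_secret_key);
    free(pk);
    free(sm);
    OQS_SIG_free(sig);
    return ok == OQS_SUCCESS ? EXIT_SUCCESS : EXIT_FAILURE;
}
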
diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml
new file mode 100644
index 0000000000..0d84b9381d
--- /dev/null
+++ b/docs/algorithms/sig/mayo.yml
@@ -0,0 +1,143 @@
+name: MAYO
+type: signature
+principal-submitters:
+- Ward Beullens
+- Fabio Campos
+- Sofía Celi
+- Basil Hess
+- Matthias J. Kannwischer
+crypto-assumption: multivariable quadratic equations, oil and vinegar
+website: https://pqmayo.org
+nist-round: 1
+spec-version: https://doi.org/10.46586/tches.v2024.i2.252-275
+primary-upstream:
+ source: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e
+ with copy_from_upstream patches
+ spdx-license-identifier: Apache-2.0
+parameter-sets:
+- name: MAYO-1
+ claimed-nist-level: 1
+ claimed-security: EUF-CMA
+ length-public-key: 1168
+ length-secret-key: 24
+ length-signature: 321
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: opt
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ operating_systems:
+ - Darwin
+ - Linux
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+- name: MAYO-2
+ claimed-nist-level: 1
+ claimed-security: EUF-CMA
+ length-public-key: 5488
+ length-secret-key: 24
+ length-signature: 180
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: opt
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ operating_systems:
+ - Darwin
+ - Linux
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+- name: MAYO-3
+ claimed-nist-level: 3
+ claimed-security: EUF-CMA
+ length-public-key: 2656
+ length-secret-key: 32
+ length-signature: 577
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: opt
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ operating_systems:
+ - Darwin
+ - Linux
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+- name: MAYO-5
+ claimed-nist-level: 5
+ claimed-security: EUF-CMA
+ length-public-key: 5008
+ length-secret-key: 40
+ length-signature: 838
+ implementations-switch-on-runtime-cpu-features: true
+ implementations:
+ - upstream: primary-upstream
+ upstream-id: opt
+ supported-platforms: all
+ common-crypto:
+ - SHA3: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: false
+ - upstream: primary-upstream
+ upstream-id: avx2
+ supported-platforms:
+ - architecture: x86_64
+ operating_systems:
+ - Darwin
+ - Linux
+ required_flags:
+ - avx2
+ common-crypto:
+ - SHA3: liboqs
+ - AES: liboqs
+ no-secret-dependent-branching-claimed: true
+ no-secret-dependent-branching-checked-by-valgrind: true
+ large-stack-usage: true
diff --git a/docs/cbom.json b/docs/cbom.json
index 7dd47dc218..358fc28b39 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -1,23 +1,23 @@
{
"bomFormat": "CBOM",
"specVersion": "1.4-cbom-1.0",
- "serialNumber": "urn:uuid:b3ac0f3d-b320-4f0f-bbef-6c535c1e9874",
+ "serialNumber": "urn:uuid:004d7395-7601-44af-97dd-57c2214e5f60",
"version": 1,
"metadata": {
- "timestamp": "2024-03-05T11:49:42.428605",
+ "timestamp": "2024-07-11T15:22:22.228289",
"component": {
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
"name": "liboqs",
- "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
+ "version": "ca5d956097e10672aaa9bb7994057bcc58291b65"
}
},
"components": [
{
"type": "library",
- "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+ "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
"name": "liboqs",
- "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
+ "version": "ca5d956097e10672aaa9bb7994057bcc58291b65"
},
{
"type": "crypto-asset",
@@ -1539,6 +1539,166 @@
"nistQuantumSecurityLevel": 5
}
},
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-1:generic",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-1",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-1:x86_64",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-1",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-2:generic",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-2",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-2:x86_64",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-2",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 1
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-3:generic",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-3",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 3
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-3:x86_64",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-3",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 3
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-5:generic",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-5",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "generic"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
+ {
+ "type": "crypto-asset",
+ "bom-ref": "alg:MAYO-5:x86_64",
+ "name": "MAYO",
+ "cryptoProperties": {
+ "assetType": "algorithm",
+ "algorithmProperties": {
+ "variant": "MAYO-5",
+ "primitive": "signature",
+ "implementationLevel": "softwarePlainRam",
+ "cryptoFunctions": [
+ "keygen",
+ "sign",
+ "verify"
+ ],
+ "implementationPlatform": "x86_64"
+ },
+ "nistQuantumSecurityLevel": 5
+ }
+ },
{
"type": "crypto-asset",
"bom-ref": "alg:ML-DSA-44-ipd:generic",
@@ -2168,7 +2328,7 @@
],
"dependencies": [
{
- "ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+ "ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
"dependsOn": [
"alg:BIKE-L1:x86_64",
"alg:BIKE-L3:x86_64",
@@ -2246,6 +2406,14 @@
"alg:Falcon-padded-1024:generic",
"alg:Falcon-padded-1024:x86_64",
"alg:Falcon-padded-1024:armv8-a",
+ "alg:MAYO-1:generic",
+ "alg:MAYO-1:x86_64",
+ "alg:MAYO-2:generic",
+ "alg:MAYO-2:x86_64",
+ "alg:MAYO-3:generic",
+ "alg:MAYO-3:x86_64",
+ "alg:MAYO-5:generic",
+ "alg:MAYO-5:x86_64",
"alg:ML-DSA-44-ipd:generic",
"alg:ML-DSA-44-ipd:x86_64",
"alg:ML-DSA-65-ipd:generic",
@@ -2843,6 +3011,68 @@
],
"dependencyType": "uses"
},
+ {
+ "ref": "alg:MAYO-1:generic",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-1:x86_64",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-2:generic",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-2:x86_64",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-3:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-3:x86_64",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-5:generic",
+ "dependsOn": [
+ "alg:sha3"
+ ],
+ "dependencyType": "uses"
+ },
+ {
+ "ref": "alg:MAYO-5:x86_64",
+ "dependsOn": [
+ "alg:sha3",
+ "alg:aes"
+ ],
+ "dependencyType": "uses"
+ },
{
"ref": "alg:ML-DSA-44-ipd:generic",
"dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 3417180c7c..f9582fa74f 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -53,6 +53,14 @@ upstreams:
sig_meta_path: '{pretty_name_full}_META.yml'
sig_scheme_path: '.'
patches: [pqcrystals-ml_dsa_ipd.patch]
+ -
+ name: pqmayo
+ git_url: https://github.com/PQCMayo/MAYO-C.git
+ git_branch: nibbling-mayo
+ git_commit: cde2675ff404b0ae070e7dbc3d962ea0b026a81e
+ sig_meta_path: 'META/{pretty_name_full}_META.yml'
+ sig_scheme_path: '.'
+ patches: [pqmayo-aes.patch, pqmayo-mem.patch]
kems:
-
name: classic_mceliece
@@ -301,3 +309,28 @@ sigs:
pqclean_scheme: sphincs-shake-256s-simple
pretty_name_full: SPHINCS+-SHAKE-256s-simple
signed_msg_order: sig_then_msg
+ -
+ name: mayo
+ default_implementation: opt
+ upstream_location: pqmayo
+ schemes:
+ -
+ scheme: "1"
+ pqclean_scheme: mayo-1
+ pretty_name_full: MAYO-1
+ signed_msg_order: sig_then_msg
+ -
+ scheme: "2"
+ pqclean_scheme: mayo-2
+ pretty_name_full: MAYO-2
+ signed_msg_order: sig_then_msg
+ -
+ scheme: "3"
+ pqclean_scheme: mayo-3
+ pretty_name_full: MAYO-3
+ signed_msg_order: sig_then_msg
+ -
+ scheme: "5"
+ pqclean_scheme: mayo-5
+ pretty_name_full: MAYO-5
+ signed_msg_order: sig_then_msg
diff --git a/scripts/copy_from_upstream/patches/pqmayo-aes.patch b/scripts/copy_from_upstream/patches/pqmayo-aes.patch
new file mode 100644
index 0000000000..2dd469eed3
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqmayo-aes.patch
@@ -0,0 +1,22 @@
+diff --git a/src/common/aes_ctr.h b/src/common/aes_ctr.h
+index c47c01e..c5fd013 100644
+--- a/src/common/aes_ctr.h
++++ b/src/common/aes_ctr.h
+@@ -16,8 +16,14 @@ int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+ #define AES_128_CTR AES_128_CTR_NI
+ #else
+-int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+- const unsigned char *input, size_t inputByteLen);
++#include
++static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
++ const unsigned char *input, size_t inputByteLen) {
++ (void) inputByteLen;
++ uint8_t iv[12] = { 0 };
++ aes128ctr_prf(output, outputByteLen, input, iv);
++ return (int) outputByteLen;
++}
+ #endif
+
+ #endif
+
\ No newline at end of file
diff --git a/scripts/copy_from_upstream/patches/pqmayo-mem.patch b/scripts/copy_from_upstream/patches/pqmayo-mem.patch
new file mode 100644
index 0000000000..ab47b79a06
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqmayo-mem.patch
@@ -0,0 +1,33 @@
+diff --git a/include/mem.h b/include/mem.h
+index 4695847..dc5172c 100644
+--- a/include/mem.h
++++ b/include/mem.h
+@@ -40,13 +40,16 @@ static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+ return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+ }
+
++#include
+ /**
+ * Clears and frees allocated memory.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+-void mayo_secure_free(void *mem, size_t size);
++static inline void mayo_secure_free(void *mem, size_t size) {
++ OQS_MEM_secure_free(mem, size);
++}
+
+ /**
+ * Clears memory.
+@@ -54,6 +57,8 @@ void mayo_secure_free(void *mem, size_t size);
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+-void mayo_secure_clear(void *mem, size_t size);
++static inline void mayo_secure_clear(void *mem, size_t size) {
++ OQS_MEM_cleanse(mem, size);
++}
+
+ #endif
+\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a5b64fd294..25a9b74086 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,6 +55,10 @@ if(OQS_ENABLE_SIG_SPHINCS)
add_subdirectory(sig/sphincs)
set(SIG_OBJS ${SIG_OBJS} ${SPHINCS_OBJS})
endif()
+if(OQS_ENABLE_SIG_MAYO)
+ add_subdirectory(sig/mayo)
+ set(SIG_OBJS ${SIG_OBJS} ${MAYO_OBJS})
+endif()
##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_OBJECTS_END
if(OQS_ENABLE_SIG_STFL_XMSS)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 73b917e07c..d82b4ea268 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -26,7 +26,7 @@ else()
if (OQS_DIST_X86_64_BUILD OR OQS_USE_AES_INSTRUCTIONS)
set(AES_IMPL ${AES_IMPL} aes/aes128_ni.c)
set(AES_IMPL ${AES_IMPL} aes/aes256_ni.c)
- set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS -maes)
+ set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3")
set_source_files_properties(aes/aes256_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3")
elseif (OQS_DIST_ARM64_V8_BUILD)
set(AES_IMPL ${AES_IMPL} aes/aes128_armv8.c)
diff --git a/src/common/aes/aes.c b/src/common/aes/aes.c
index 3ac8794991..01e473b819 100644
--- a/src/common/aes/aes.c
+++ b/src/common/aes/aes.c
@@ -19,6 +19,18 @@ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **schedule) {
callbacks->AES128_ECB_load_schedule(key, schedule);
}
+void OQS_AES128_CTR_inc_init(const uint8_t *key, void **_schedule) {
+ callbacks->AES128_CTR_inc_init(key, _schedule);
+}
+
+void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) {
+ callbacks->AES128_CTR_inc_iv(iv, iv_len, _schedule);
+}
+
+void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) {
+ callbacks->AES128_CTR_inc_ivu64(iv, _schedule);
+}
+
void OQS_AES128_free_schedule(void *schedule) {
callbacks->AES128_free_schedule(schedule);
}
@@ -51,6 +63,10 @@ void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len
callbacks->AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
}
+void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ callbacks->AES128_CTR_inc_stream_iv(iv, iv_len, schedule, out, out_len);
+}
+
void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
callbacks->AES256_ECB_enc(plaintext, plaintext_len, key, ciphertext);
}
diff --git a/src/common/aes/aes.h b/src/common/aes/aes.h
index 011686b3e9..d0d6d634bc 100644
--- a/src/common/aes/aes.h
+++ b/src/common/aes/aes.h
@@ -28,6 +28,37 @@ extern "C" {
*/
void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **ctx);
+/**
+ * Function to initialize a context and fill a key schedule given an initial key for
+ * use in CTR mode.
+ *
+ * @param key Initial Key.
+ * @param ctx Abstract data structure for a key schedule.
+ */
+void OQS_AES128_CTR_inc_init(const uint8_t *key, void **ctx);
+
+/**
+ * Function to fill a context given an IV for use in CTR mode.
+ *
+ * Handles a 12- or 16-byte IV. If a 12-byte IV is given, then 4 counter
+ * bytes are initialized to all zeros.
+ *
+ * @param iv Initialization Vector.
+ * @param iv_len Length of the initialization vector.
+ * @param ctx Abstract data structure for IV.
+ */
+void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *ctx);
+
+/**
+ * Function to fill a context given an IV for use in CTR mode.
+ * Handles an 8-byte IV passed as a 64-bit unsigned integer,
+ * counter bytes are initialized to zero.
+ *
+ * @param iv Initialization Vector as 64-bit integer.
+ * @param ctx Abstract data structure for IV.
+ */
+void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *ctx);
+
/**
* Function to free a key schedule.
*
@@ -55,6 +86,21 @@ void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, co
*/
void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+/**
+ * AES counter mode keystream generator. A context generated by
+ * OQS_AES128_CTR_inc_init() is passed rather than a key.
+ *
+ * Handles a 12- or 16-byte IV. If a 12-byte IV is given, then 4 counter
+ * bytes are initialized to all zeros.
+ *
+ * @param iv 12- or 16-byte initialization vector.
+ * @param iv_len Length of the IV in bytes.
+ * @param ctx Abstract data structure for a key schedule.
+ * @param out Pointer to a block of memory which is big enough to contain out_len bytes; the result will be written here.
+ * @param out_len Length of output bytes to generate.
+ */
+void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len);
+
/**
* Function to fill a key schedule given an initial key for use in ECB mode encryption.
*
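
The four declarations above give AES-128 the same incremental CTR interface that OQS_AES256_CTR_* already provides, presumably to back MAYO's AES-128-CTR-based expansion through the pqclean shims. A minimal keystream sketch using the new calls (the <oqs/aes.h> include path and the 12-byte IV are illustrative); OQS_AES128_CTR_inc_iv / _inc_ivu64 would instead be used to load an IV into the context for block-wise generation:

#include <oqs/aes.h>   /* assumed include path for these declarations */
#include <stdint.h>

void aes128_ctr_keystream_sketch(const uint8_t key[16]) {
    uint8_t iv[12] = {0};      /* 12-byte IV: the 4 counter bytes start at zero */
    uint8_t keystream[64];
    void *ctx = NULL;

    OQS_AES128_CTR_inc_init(key, &ctx);    /* expand the key schedule once */
    /* One-shot keystream for this IV; the counter advances per 16-byte block. */
    OQS_AES128_CTR_inc_stream_iv(iv, sizeof(iv), ctx, keystream, sizeof(keystream));
    OQS_AES128_free_schedule(ctx);
}
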
diff --git a/src/common/aes/aes128_armv8.c b/src/common/aes/aes128_armv8.c
index b5003ad018..292539fefa 100644
--- a/src/common/aes/aes128_armv8.c
+++ b/src/common/aes/aes128_armv8.c
@@ -3,15 +3,28 @@
#include
#include
#include
+#include
#include
#include
-#define PQC_AES128_STATESIZE 88
typedef struct {
- uint64_t sk_exp[PQC_AES128_STATESIZE];
+ uint64_t sk_exp[22];
+ uint8_t iv[16];
} aes128ctx;
+void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule) {
+ aes128ctx *ctx = _schedule;
+ if (iv_len == 12) {
+ memcpy(ctx->iv, iv, 12);
+ memset(&ctx->iv[12], 0, 4);
+ } else if (iv_len == 16) {
+ memcpy(ctx->iv, iv, 16);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+}
+
// From crypto_core/aes128encrypt/dolbeau/armv8crypto
static inline void aes128_armv8_encrypt(const unsigned char *rkeys, const unsigned char *n, unsigned char *out) {
uint8x16_t temp = vld1q_u8(n);
@@ -62,3 +75,64 @@ void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plainte
oqs_aes128_enc_sch_block_armv8(plaintext + (16 * block), (const void *) ctx->sk_exp, ciphertext + (16 * block));
}
}
+
+static uint32_t UINT32_TO_BE(const uint32_t x) {
+ union {
+ uint32_t val;
+ uint8_t bytes[4];
+ } y;
+ y.bytes[0] = (x >> 24) & 0xFF;
+ y.bytes[1] = (x >> 16) & 0xFF;
+ y.bytes[2] = (x >> 8) & 0xFF;
+ y.bytes[3] = x & 0xFF;
+ return y.val;
+}
+#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
+
+
+void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks) {
+ aes128ctx *ctx = (aes128ctx *) schedule;
+ uint8_t *block = ctx->iv;
+ uint32_t ctr;
+ uint32_t ctr_be;
+ memcpy(&ctr_be, &block[12], 4);
+ ctr = BE_TO_UINT32(ctr_be);
+ while (out_blks >= 1) {
+ oqs_aes128_enc_sch_block_armv8(block, schedule, out);
+ out += 16;
+ out_blks--;
+ ctr++;
+ ctr_be = UINT32_TO_BE(ctr);
+ memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+ }
+}
+
+void oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ uint8_t block[16];
+ uint32_t ctr;
+ uint32_t ctr_be;
+ memcpy(block, iv, 12);
+ if (iv_len == 12) {
+ ctr = 0;
+ } else if (iv_len == 16) {
+ memcpy(&ctr_be, &iv[12], 4);
+ ctr = BE_TO_UINT32(ctr_be);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+ while (out_len >= 16) {
+ ctr_be = UINT32_TO_BE(ctr);
+ memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+ oqs_aes128_enc_sch_block_armv8(block, schedule, out);
+ out += 16;
+ out_len -= 16;
+ ctr++;
+ }
+ if (out_len > 0) {
+ uint8_t tmp[16];
+ ctr_be = UINT32_TO_BE(ctr);
+ memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+ oqs_aes128_enc_sch_block_armv8(block, schedule, tmp);
+ memcpy(out, tmp, out_len);
+ }
+}
diff --git a/src/common/aes/aes128_ni.c b/src/common/aes/aes128_ni.c
index 0593614503..b08a3041a4 100644
--- a/src/common/aes/aes128_ni.c
+++ b/src/common/aes/aes128_ni.c
@@ -5,9 +5,16 @@
#include
#include
#include
+#include
#include
#include
+#include
+
+typedef struct {
+ __m128i sk_exp[11];
+ __m128i iv;
+} aes128ctx;
// From crypto_core/aes128ncrypt/dolbeau/aesenc-int
static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[11]) {
@@ -42,21 +49,39 @@ static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rke
}
void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) {
- *_schedule = malloc(11 * sizeof(__m128i));
+ *_schedule = malloc(sizeof(aes128ctx));
+ OQS_EXIT_IF_NULLPTR(*_schedule, "AES");
assert(*_schedule != NULL);
- __m128i *schedule = (__m128i *) *_schedule;
+ __m128i *schedule = ((aes128ctx *) *_schedule)->sk_exp;
aes128ni_setkey_encrypt(key, schedule);
}
+void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) {
+ aes128ctx *ctx = _schedule;
+ __m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+ if (iv_len == 12) {
+ const int32_t *ivi = (const int32_t *) iv;
+ ctx->iv = _mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx);
+ } else if (iv_len == 16) {
+ ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+}
+
+void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule) {
+ aes128ctx *ctx = _schedule;
+ ctx->iv = _mm_loadl_epi64((__m128i *)&iv);
+}
+
void oqs_aes128_free_schedule_ni(void *schedule) {
if (schedule != NULL) {
- OQS_MEM_secure_free(schedule, 11 * sizeof(__m128i));
+ OQS_MEM_secure_free(schedule, sizeof(aes128ctx));
}
}
// From crypto_core/aes128encrypt/dolbeau/aesenc-int
-static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
- __m128i nv = _mm_loadu_si128((const __m128i *)n);
+static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, unsigned char *out) {
__m128i temp = _mm_xor_si128(nv, rkeys[0]);
temp = _mm_aesenc_si128(temp, rkeys[1]);
temp = _mm_aesenc_si128(temp, rkeys[2]);
@@ -71,9 +96,45 @@ static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char
_mm_storeu_si128((__m128i *)(out), temp);
}
+// 4x interleaved encryption
+static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0,
+ __m128i n1, __m128i n2, __m128i n3,
+ unsigned char *out) {
+ __m128i temp0 = _mm_xor_si128(n0, rkeys[0]);
+ __m128i temp1 = _mm_xor_si128(n1, rkeys[0]);
+ __m128i temp2 = _mm_xor_si128(n2, rkeys[0]);
+ __m128i temp3 = _mm_xor_si128(n3, rkeys[0]);
+
+#define AESNENCX4(IDX) \
+ temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]); \
+ temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]); \
+ temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]); \
+ temp3 = _mm_aesenc_si128(temp3, rkeys[IDX])
+
+ AESNENCX4(1);
+ AESNENCX4(2);
+ AESNENCX4(3);
+ AESNENCX4(4);
+ AESNENCX4(5);
+ AESNENCX4(6);
+ AESNENCX4(7);
+ AESNENCX4(8);
+ AESNENCX4(9);
+
+ temp0 = _mm_aesenclast_si128(temp0, rkeys[10]);
+ temp1 = _mm_aesenclast_si128(temp1, rkeys[10]);
+ temp2 = _mm_aesenclast_si128(temp2, rkeys[10]);
+ temp3 = _mm_aesenclast_si128(temp3, rkeys[10]);
+
+ _mm_storeu_si128((__m128i *)(out + 0), temp0);
+ _mm_storeu_si128((__m128i *)(out + 16), temp1);
+ _mm_storeu_si128((__m128i *)(out + 32), temp2);
+ _mm_storeu_si128((__m128i *)(out + 48), temp3);
+}
+
void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
- const __m128i *schedule = (const __m128i *) _schedule;
- aes128ni_encrypt(schedule, plaintext, ciphertext);
+ const __m128i *schedule = ((const aes128ctx *) _schedule)->sk_exp;
+ aes128ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext);
}
void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
@@ -82,3 +143,61 @@ void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_
oqs_aes128_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
}
}
+
+void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) {
+ aes128ctx *ctx = (aes128ctx *) schedule;
+ const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+
+ while (out_blks >= 4) {
+ __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
+ __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask);
+ __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask);
+ __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask);
+ aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
+ ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0));
+ out += 64;
+ out_blks -= 4;
+ }
+ while (out_blks >= 1) {
+ __m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
+ aes128ni_encrypt(schedule, nv0, out);
+ ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0));
+ out += 16;
+ out_blks--;
+ }
+}
+
+void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ __m128i block;
+ __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+ if (iv_len == 12) {
+ const int32_t *ivi = (const int32_t *) iv;
+ block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]);
+ } else if (iv_len == 16) {
+ block = _mm_loadu_si128((const __m128i *)iv);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+
+ while (out_len >= 64) {
+ __m128i nv0 = block;
+ __m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
+ __m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask);
+ __m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask);
+ aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
+ block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask);
+ out += 64;
+ out_len -= 64;
+ }
+ while (out_len >= 16) {
+ aes128ni_encrypt(schedule, block, out);
+ out += 16;
+ out_len -= 16;
+ block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
+ }
+ if (out_len > 0) {
+ uint8_t tmp[16];
+ aes128ni_encrypt(schedule, block, tmp);
+ memcpy(out, tmp, out_len);
+ }
+}
diff --git a/src/common/aes/aes_c.c b/src/common/aes/aes_c.c
index 6ee93bc76a..f2ec57a500 100644
--- a/src/common/aes/aes_c.c
+++ b/src/common/aes/aes_c.c
@@ -574,6 +574,39 @@ static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
}
}
+static inline void aes128_ctr_upd_blks(unsigned char *out, size_t outblks, aes128ctx *ctx) {
+ uint32_t ivw[16];
+ size_t i;
+ uint32_t cc;
+ uint8_t *iv = ctx->iv;
+ uint32_t blocks = (uint32_t) outblks;
+ unsigned int nrounds = 10;
+
+ br_range_dec32le(ivw, 4, iv);
+
+ memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t));
+ memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t));
+ memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
+ cc = br_swap32(ivw[3]);
+ ivw[ 7] = br_swap32(cc + 1);
+ ivw[11] = br_swap32(cc + 2);
+ ivw[15] = br_swap32(cc + 3);
+
+ while (outblks >= 4) {
+ aes_ctr4x(out, ivw, ctx->sk_exp, nrounds);
+ out += 64;
+ outblks -= 4;
+ }
+ if (outblks > 0) {
+ unsigned char tmp[64];
+ aes_ctr4x(tmp, ivw, ctx->sk_exp, nrounds);
+ for (i = 0; i < outblks * 16; i++) {
+ out[i] = tmp[i];
+ }
+ }
+ br_enc32be(&ctx->iv[12], cc + blocks);
+}
+
static inline void aes256_ctr_upd_blks(unsigned char *out, size_t outblks, aes256ctx *ctx) {
uint32_t ivw[16];
size_t i;
@@ -725,12 +758,48 @@ void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule)
aes_keysched_no_bitslice(schedule, (const unsigned char *) key, 16);
}
+void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule) {
+ aes128ctx *ctx = _schedule;
+ if (iv_len == 12) {
+ memcpy(ctx->iv, iv, 12);
+ memset(&ctx->iv[12], 0, 4);
+ } else if (iv_len == 16) {
+ memcpy(ctx->iv, iv, 16);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+}
+
+void oqs_aes128_load_iv_u64_c(uint64_t iv, void *schedule) {
+ OQS_EXIT_IF_NULLPTR(schedule, "AES");
+ aes128ctx *ctx = (aes128ctx *) schedule;
+ ctx->iv[7] = (unsigned char)(iv >> 56);
+ ctx->iv[6] = (unsigned char)(iv >> 48);
+ ctx->iv[5] = (unsigned char)(iv >> 40);
+ ctx->iv[4] = (unsigned char)(iv >> 32);
+ ctx->iv[3] = (unsigned char)(iv >> 24);
+ ctx->iv[2] = (unsigned char)(iv >> 16);
+ ctx->iv[1] = (unsigned char)(iv >> 8);
+ ctx->iv[0] = (unsigned char)iv;
+ memset(&ctx->iv[8], 0, 8);
+}
+
void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
assert(plaintext_len % 16 == 0);
const aes128ctx *ctx = (const aes128ctx *) schedule;
aes_ecb(ciphertext, plaintext, plaintext_len / 16, ctx->sk_exp, 10);
}
+void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ const aes128ctx *ctx = (const aes128ctx *) schedule;
+ aes_ctr(out, out_len, iv, iv_len, ctx->sk_exp, 10);
+}
+
+void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, uint8_t *out, size_t out_blks) {
+ aes128ctx *ctx = (aes128ctx *) schedule;
+ aes128_ctr_upd_blks(out, out_blks, ctx);
+}
+
void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
assert(plaintext_len % 16 == 0);
const aes256ctx *ctx = (const aes256ctx *) schedule;
diff --git a/src/common/aes/aes_impl.c b/src/common/aes/aes_impl.c
index ae9be662cf..706a5f186f 100644
--- a/src/common/aes/aes_impl.c
+++ b/src/common/aes/aes_impl.c
@@ -46,6 +46,26 @@ static void AES128_ECB_load_schedule(const uint8_t *key, void **_schedule) {
);
}
+static void AES128_CTR_inc_init(const uint8_t *key, void **_schedule) {
+ AES128_ECB_load_schedule(key, _schedule);
+}
+
+static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) {
+ C_OR_NI_OR_ARM(
+ oqs_aes128_load_iv_c(iv, iv_len, _schedule),
+ oqs_aes128_load_iv_ni(iv, iv_len, _schedule),
+ oqs_aes128_load_iv_armv8(iv, iv_len, _schedule)
+ );
+}
+
+static void AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) {
+ C_OR_NI_OR_ARM(
+ oqs_aes128_load_iv_u64_c(iv, _schedule),
+ oqs_aes128_load_iv_u64_ni(iv, _schedule),
+ (void) iv; (void) _schedule
+ );
+}
+
static void AES128_free_schedule(void *schedule) {
C_OR_NI_OR_ARM(
oqs_aes128_free_schedule_c(schedule),
@@ -107,6 +127,14 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
);
}
+static void AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ C_OR_NI_OR_ARM(
+ oqs_aes128_ctr_enc_sch_c(iv, iv_len, schedule, out, out_len),
+ oqs_aes128_ctr_enc_sch_ni(iv, iv_len, schedule, out, out_len),
+ oqs_aes128_ctr_enc_sch_armv8(iv, iv_len, schedule, out, out_len)
+ );
+}
+
static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
static void AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
@@ -141,19 +169,23 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_
}
struct OQS_AES_callbacks aes_default_callbacks = {
- AES128_ECB_load_schedule,
- AES128_free_schedule,
- AES128_ECB_enc,
- AES128_ECB_enc_sch,
- AES256_ECB_load_schedule,
- AES256_CTR_inc_init,
- AES256_CTR_inc_iv,
- AES256_CTR_inc_ivu64,
- AES256_free_schedule,
- AES256_ECB_enc,
- AES256_ECB_enc_sch,
- AES256_CTR_inc_stream_iv,
- AES256_CTR_inc_stream_blks,
+ .AES128_ECB_load_schedule = AES128_ECB_load_schedule,
+ .AES128_CTR_inc_init = AES128_CTR_inc_init,
+ .AES128_CTR_inc_iv = AES128_CTR_inc_iv,
+ .AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64,
+ .AES128_free_schedule = AES128_free_schedule,
+ .AES128_ECB_enc = AES128_ECB_enc,
+ .AES128_ECB_enc_sch = AES128_ECB_enc_sch,
+ .AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv,
+ .AES256_ECB_load_schedule = AES256_ECB_load_schedule,
+ .AES256_CTR_inc_init = AES256_CTR_inc_init,
+ .AES256_CTR_inc_iv = AES256_CTR_inc_iv,
+ .AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64,
+ .AES256_free_schedule = AES256_free_schedule,
+ .AES256_ECB_enc = AES256_ECB_enc,
+ .AES256_ECB_enc_sch = AES256_ECB_enc_sch,
+ .AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv,
+ .AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks,
};
void OQS_AES_init(void) {
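
The switch from positional to designated initializers is what lets the new AES-128 CTR members sit in the middle of struct OQS_AES_callbacks without silently shifting the existing entries into the wrong slots. A small, self-contained illustration of the failure mode this avoids (hypothetical struct, not liboqs code):

/* Hypothetical callback table with a member inserted mid-struct. */
struct callbacks {
    void (*load)(void);
    void (*stream)(void);    /* newly inserted member */
    void (*free_ctx)(void);
};

static void my_load(void) {}
static void my_free(void) {}

/* Positional form: my_free now binds to .stream and .free_ctx is left NULL. */
struct callbacks positional = { my_load, my_free };

/* Designated form keeps each function attached to its named slot. */
struct callbacks designated = { .load = my_load, .free_ctx = my_free };
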
diff --git a/src/common/aes/aes_local.h b/src/common/aes/aes_local.h
index 4c9942a085..a9001a2e31 100644
--- a/src/common/aes/aes_local.h
+++ b/src/common/aes/aes_local.h
@@ -3,18 +3,29 @@
#include
void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule);
+void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule);
void oqs_aes128_free_schedule_ni(void *schedule);
void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks);
void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule);
+void oqs_aes128_load_iv_u64_c(uint64_t iv, void *_schedule);
void oqs_aes128_free_schedule_c(void *schedule);
void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, uint8_t *out, size_t out_blks);
void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule);
void oqs_aes128_free_schedule_no_bitslice(void *schedule);
void oqs_aes128_enc_sch_block_armv8(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks);
void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule);
void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule);
diff --git a/src/common/aes/aes_ops.h b/src/common/aes/aes_ops.h
index 5a26f75764..a64c47d28d 100644
--- a/src/common/aes/aes_ops.h
+++ b/src/common/aes/aes_ops.h
@@ -25,6 +25,21 @@ struct OQS_AES_callbacks {
*/
void (*AES128_ECB_load_schedule)(const uint8_t *key, void **ctx);
+ /**
+ * Implementation of function OQS_AES128_CTR_inc_init.
+ */
+ void (*AES128_CTR_inc_init)(const uint8_t *key, void **ctx);
+
+ /**
+ * Implementation of function OQS_AES128_CTR_inc_iv.
+ */
+ void (*AES128_CTR_inc_iv)(const uint8_t *iv, size_t iv_len, void *ctx);
+
+ /**
+ * Implementation of function OQS_AES128_CTR_inc_ivu64.
+ */
+ void (*AES128_CTR_inc_ivu64)(uint64_t iv, void *ctx);
+
/**
* Implementation of function OQS_AES128_free_schedule.
*/
@@ -40,6 +55,11 @@ struct OQS_AES_callbacks {
*/
void (*AES128_ECB_enc_sch)(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+ /**
+ * Implementation of function OQS_AES128_CTR_inc_stream_iv.
+ */
+ void (*AES128_CTR_inc_stream_iv)(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len);
+
/**
* Implementation of function OQS_AES256_ECB_load_schedule.
*/
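
These members are the hooks applications override through OQS_AES_set_callbacks() when substituting their own AES implementation; with the additions above, such an override now also covers the AES-128 CTR entry points. A minimal sketch (the include path is assumed, the body is a placeholder rather than a usable cipher, and a real override must populate every member of the table):

#include <oqs/aes_ops.h>   /* assumed include path for the callback definitions */
#include <string.h>

/* Placeholder AES-128 CTR keystream hook; a real override would call into a
 * hardware engine or an external crypto library here. */
static void my_aes128_ctr_inc_stream_iv(const uint8_t *iv, size_t iv_len,
                                        const void *ctx, uint8_t *out, size_t out_len) {
    (void) iv; (void) iv_len; (void) ctx;
    memset(out, 0, out_len);   /* NOT a cipher: illustration only */
}

void install_custom_aes(void) {
    static struct OQS_AES_callbacks cb = {
        .AES128_CTR_inc_stream_iv = my_aes128_ctr_inc_stream_iv,
        /* ... all remaining members must be set for a working override ... */
    };
    OQS_AES_set_callbacks(&cb);
}
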
diff --git a/src/common/aes/aes_ossl.c b/src/common/aes/aes_ossl.c
index feaff39557..c7dc5b9445 100644
--- a/src/common/aes/aes_ossl.c
+++ b/src/common/aes/aes_ossl.c
@@ -66,6 +66,67 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ks->ctx, ciphertext, &outlen));
}
+static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+ EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new());
+ OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
+ uint8_t iv_ctr[16];
+ if (iv_len == 12) {
+ memcpy(iv_ctr, iv, 12);
+ iv_ctr[12] = 0;
+ iv_ctr[13] = 0;
+ iv_ctr[14] = 0;
+ iv_ctr[15] = 0;
+ } else if (iv_len == 16) {
+ memcpy(iv_ctr, iv, 16);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+ const struct key_schedule *ks = (const struct key_schedule *) schedule;
+ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ctr_ctx, oqs_aes_128_ctr(), NULL, ks->key, iv_ctr));
+
+ SIZE_T_TO_INT_OR_EXIT(out_len, out_len_input_int)
+ memset(out, 0, (size_t)out_len_input_int);
+ int out_len_output;
+ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptUpdate)(ctr_ctx, out, &out_len_output, out, out_len_input_int));
+ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ctr_ctx, out + out_len_output, &out_len_output));
+ OSSL_FUNC(EVP_CIPHER_CTX_free)(ctr_ctx);
+}
+
+static void AES128_CTR_inc_init(const uint8_t *key, void **schedule) {
+ *schedule = malloc(sizeof(struct key_schedule));
+ OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
+ struct key_schedule *ks = (struct key_schedule *) *schedule;
+ EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
+ OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
+
+ ks->for_ECB = 0;
+ ks->ctx = ctr_ctx;
+ memcpy(ks->key, key, 16);
+}
+
+static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *schedule) {
+ OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL");
+ struct key_schedule *ks = (struct key_schedule *) schedule;
+ if (iv_len == 12) {
+ memcpy(ks->iv, iv, 12);
+ memset(&ks->iv[12], 0, 4);
+ } else if (iv_len == 16) {
+ memcpy(ks->iv, iv, 16);
+ } else {
+ exit(EXIT_FAILURE);
+ }
+ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv));
+}
+
+static void AES128_CTR_inc_ivu64(uint64_t iv, void *schedule) {
+ OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL");
+ struct key_schedule *ks = (struct key_schedule *) schedule;
+ br_enc64be(ks->iv, iv);
+ memset(&ks->iv[8], 0, 8);
+ OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv));
+}
+
static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) {
*schedule = malloc(sizeof(struct key_schedule));
OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
@@ -79,11 +140,12 @@ static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) {
static void AES256_CTR_inc_init(const uint8_t *key, void **schedule) {
*schedule = malloc(sizeof(struct key_schedule));
+ OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
struct key_schedule *ks = (struct key_schedule *) *schedule;
EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
- assert(ctr_ctx != NULL);
+ OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
- OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
ks->for_ECB = 0;
ks->ctx = ctr_ctx;
memcpy(ks->key, key, 32);
@@ -130,7 +192,7 @@ static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
static void AES256_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
- assert(ctr_ctx != NULL);
+ OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
uint8_t iv_ctr[16];
if (iv_len == 12) {
memcpy(iv_ctr, iv, 12);
@@ -164,17 +226,21 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_
}
struct OQS_AES_callbacks aes_default_callbacks = {
- AES128_ECB_load_schedule,
- AES128_free_schedule,
- AES128_ECB_enc,
- AES128_ECB_enc_sch,
- AES256_ECB_load_schedule,
- AES256_CTR_inc_init,
- AES256_CTR_inc_iv,
- AES256_CTR_inc_ivu64,
- AES256_free_schedule,
- AES256_ECB_enc,
- AES256_ECB_enc_sch,
- AES256_CTR_inc_stream_iv,
- AES256_CTR_inc_stream_blks,
+ .AES128_ECB_load_schedule = AES128_ECB_load_schedule,
+ .AES128_CTR_inc_init = AES128_CTR_inc_init,
+ .AES128_CTR_inc_iv = AES128_CTR_inc_iv,
+ .AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64,
+ .AES128_free_schedule = AES128_free_schedule,
+ .AES128_ECB_enc = AES128_ECB_enc,
+ .AES128_ECB_enc_sch = AES128_ECB_enc_sch,
+ .AES256_ECB_load_schedule = AES256_ECB_load_schedule,
+ .AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv,
+ .AES256_CTR_inc_init = AES256_CTR_inc_init,
+ .AES256_CTR_inc_iv = AES256_CTR_inc_iv,
+ .AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64,
+ .AES256_free_schedule = AES256_free_schedule,
+ .AES256_ECB_enc = AES256_ECB_enc,
+ .AES256_ECB_enc_sch = AES256_ECB_enc_sch,
+ .AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv,
+ .AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks,
};
diff --git a/src/common/ossl_functions.h b/src/common/ossl_functions.h
index aa0ceb127c..438ec1fafa 100644
--- a/src/common/ossl_functions.h
+++ b/src/common/ossl_functions.h
@@ -25,6 +25,7 @@ VOID_FUNC(void, EVP_MD_CTX_free, (EVP_MD_CTX *ctx), (ctx))
FUNC(EVP_MD_CTX *, EVP_MD_CTX_new, (void), ())
FUNC(int, EVP_MD_CTX_reset, (EVP_MD_CTX *ctx), (ctx))
FUNC(const EVP_CIPHER *, EVP_aes_128_ecb, (void), ())
+FUNC(const EVP_CIPHER *, EVP_aes_128_ctr, (void), ())
FUNC(const EVP_CIPHER *, EVP_aes_256_ecb, (void), ())
FUNC(const EVP_CIPHER *, EVP_aes_256_ctr, (void), ())
#if OPENSSL_VERSION_NUMBER >= 0x30000000L
diff --git a/src/common/ossl_helpers.c b/src/common/ossl_helpers.c
index 1c73d8b901..76dccb0ef4 100644
--- a/src/common/ossl_helpers.c
+++ b/src/common/ossl_helpers.c
@@ -18,7 +18,7 @@ static EVP_MD *sha256_ptr, *sha384_ptr, *sha512_ptr,
*sha3_256_ptr, *sha3_384_ptr, *sha3_512_ptr,
*shake128_ptr, *shake256_ptr;
-static EVP_CIPHER *aes128_ecb_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr;
+static EVP_CIPHER *aes128_ecb_ptr, *aes128_ctr_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr;
static void fetch_ossl_objects(void) {
sha256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHA256", NULL);
@@ -32,12 +32,13 @@ static void fetch_ossl_objects(void) {
shake256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHAKE256", NULL);
aes128_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-ECB", NULL);
+ aes128_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-CTR", NULL);
aes256_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-ECB", NULL);
aes256_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-CTR", NULL);
if (!sha256_ptr || !sha384_ptr || !sha512_ptr || !sha3_256_ptr ||
!sha3_384_ptr || !sha3_512_ptr || !shake128_ptr || !shake256_ptr ||
- !aes128_ecb_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) {
+ !aes128_ecb_ptr || !aes128_ctr_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) {
fprintf(stderr, "liboqs warning: OpenSSL initialization failure. Is provider for SHA, SHAKE, AES enabled?\n");
}
}
@@ -61,6 +62,8 @@ static void free_ossl_objects(void) {
shake256_ptr = NULL;
OSSL_FUNC(EVP_CIPHER_free)(aes128_ecb_ptr);
aes128_ecb_ptr = NULL;
+ OSSL_FUNC(EVP_CIPHER_free)(aes128_ctr_ptr);
+ aes128_ctr_ptr = NULL;
OSSL_FUNC(EVP_CIPHER_free)(aes256_ecb_ptr);
aes256_ecb_ptr = NULL;
OSSL_FUNC(EVP_CIPHER_free)(aes256_ctr_ptr);
@@ -75,7 +78,7 @@ void oqs_ossl_destroy(void) {
#else
if (sha256_ptr || sha384_ptr || sha512_ptr || sha3_256_ptr ||
sha3_384_ptr || sha3_512_ptr || shake128_ptr || shake256_ptr ||
- aes128_ecb_ptr || aes256_ecb_ptr || aes256_ctr_ptr) {
+ aes128_ecb_ptr || aes128_ctr_ptr || aes256_ecb_ptr || aes256_ctr_ptr) {
free_ossl_objects();
}
#endif
@@ -235,6 +238,23 @@ const EVP_CIPHER *oqs_aes_128_ecb(void) {
#endif
}
+const EVP_CIPHER *oqs_aes_128_ctr(void) {
+#if OPENSSL_VERSION_NUMBER >= 0x30000000L
+#if defined(OQS_USE_PTHREADS)
+ if (pthread_once(&init_once_control, fetch_ossl_objects)) {
+ return NULL;
+ }
+#else
+ if (!aes128_ctr_ptr) {
+ fetch_ossl_objects();
+ }
+#endif
+ return aes128_ctr_ptr;
+#else
+ return OSSL_FUNC(EVP_aes_128_ctr)();
+#endif
+}
+
const EVP_CIPHER *oqs_aes_256_ecb(void) {
#if OPENSSL_VERSION_NUMBER >= 0x30000000L
#if defined(OQS_USE_PTHREADS)
diff --git a/src/common/ossl_helpers.h b/src/common/ossl_helpers.h
index fe6d34687a..3e1bc9ff25 100644
--- a/src/common/ossl_helpers.h
+++ b/src/common/ossl_helpers.h
@@ -31,6 +31,8 @@ const EVP_MD *oqs_sha3_512(void);
const EVP_CIPHER *oqs_aes_128_ecb(void);
+const EVP_CIPHER *oqs_aes_128_ctr(void);
+
const EVP_CIPHER *oqs_aes_256_ecb(void);
const EVP_CIPHER *oqs_aes_256_ctr(void);
diff --git a/src/common/pqclean_shims/aes.h b/src/common/pqclean_shims/aes.h
index 58ae1e67c9..dc72a9e157 100644
--- a/src/common/pqclean_shims/aes.h
+++ b/src/common/pqclean_shims/aes.h
@@ -12,6 +12,7 @@
#define AESCTR_NONCEBYTES 12
#define AES_BLOCKBYTES 16
+typedef void *aes128ctx;
typedef void *aes256ctx;
#define aes256_ecb_keyexp(r, key) OQS_AES256_ECB_load_schedule((key), (r))
@@ -43,4 +44,12 @@ static inline void aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[
OQS_AES256_free_schedule(state);
}
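+// AES-128 counterpart of aes256ctr_prf above: expand the key and 12-byte nonce into outlen bytes of AES-128-CTR keystream.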
+static inline void aes128ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[16], uint8_t nonce[12]) {
+ aes128ctx state;
+ OQS_AES128_CTR_inc_init(key, &state);
+ OQS_AES128_CTR_inc_stream_iv(nonce, 12, state, out, outlen);
+ OQS_AES128_free_schedule(state);
+}
+
+
#endif
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index d2d01c4771..f16421fa43 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -189,6 +189,16 @@
#cmakedefine OQS_ENABLE_SIG_sphincs_shake_256f_simple_avx2 1
#cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple 1
#cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple_avx2 1
+
+#cmakedefine OQS_ENABLE_SIG_MAYO 1
+#cmakedefine OQS_ENABLE_SIG_mayo_1 1
+#cmakedefine OQS_ENABLE_SIG_mayo_1_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_2_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_3 1
+#cmakedefine OQS_ENABLE_SIG_mayo_3_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5_avx2 1
///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_ENABLE_DEFINES_END
#cmakedefine OQS_ENABLE_SIG_STFL_XMSS 1
diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt
new file mode 100644
index 0000000000..e049f71344
--- /dev/null
+++ b/src/sig/mayo/CMakeLists.txt
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: MIT
+
+# This file was generated by
+# scripts/copy_from_upstream/copy_from_upstream.py
+
+set(_MAYO_OBJS "")
+
+if(OQS_ENABLE_SIG_mayo_1)
+ add_library(mayo_1_opt OBJECT sig_mayo_1.c pqmayo_mayo-1_opt/api.c pqmayo_mayo-1_opt/arithmetic.c pqmayo_mayo-1_opt/mayo.c pqmayo_mayo-1_opt/params.c)
+ target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+ target_include_directories(mayo_1_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_opt)
+ target_include_directories(mayo_1_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_1_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_1_avx2)
+ add_library(mayo_1_avx2 OBJECT pqmayo_mayo-1_avx2/api.c pqmayo_mayo-1_avx2/arithmetic.c pqmayo_mayo-1_avx2/mayo.c pqmayo_mayo-1_avx2/params.c)
+ target_include_directories(mayo_1_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_avx2)
+ target_include_directories(mayo_1_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_1_avx2 PRIVATE -mavx2)
+ target_compile_options(mayo_1_avx2 PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_1_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_2)
+ add_library(mayo_2_opt OBJECT sig_mayo_2.c pqmayo_mayo-2_opt/api.c pqmayo_mayo-2_opt/arithmetic.c pqmayo_mayo-2_opt/mayo.c pqmayo_mayo-2_opt/params.c)
+ target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+ target_include_directories(mayo_2_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_opt)
+ target_include_directories(mayo_2_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_2_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_2_avx2)
+ add_library(mayo_2_avx2 OBJECT pqmayo_mayo-2_avx2/api.c pqmayo_mayo-2_avx2/arithmetic.c pqmayo_mayo-2_avx2/mayo.c pqmayo_mayo-2_avx2/params.c)
+ target_include_directories(mayo_2_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_avx2)
+ target_include_directories(mayo_2_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_2_avx2 PRIVATE -mavx2)
+ target_compile_options(mayo_2_avx2 PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_2_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_3)
+ add_library(mayo_3_opt OBJECT sig_mayo_3.c pqmayo_mayo-3_opt/api.c pqmayo_mayo-3_opt/arithmetic.c pqmayo_mayo-3_opt/mayo.c pqmayo_mayo-3_opt/params.c)
+ target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ target_include_directories(mayo_3_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_opt)
+ target_include_directories(mayo_3_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_3_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_3_avx2)
+ add_library(mayo_3_avx2 OBJECT pqmayo_mayo-3_avx2/api.c pqmayo_mayo-3_avx2/arithmetic.c pqmayo_mayo-3_avx2/mayo.c pqmayo_mayo-3_avx2/params.c)
+ target_include_directories(mayo_3_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_avx2)
+ target_include_directories(mayo_3_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_3_avx2 PRIVATE -mavx2)
+ target_compile_options(mayo_3_avx2 PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_3_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_5)
+ add_library(mayo_5_opt OBJECT sig_mayo_5.c pqmayo_mayo-5_opt/api.c pqmayo_mayo-5_opt/arithmetic.c pqmayo_mayo-5_opt/mayo.c pqmayo_mayo-5_opt/params.c)
+ target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ target_include_directories(mayo_5_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_opt)
+ target_include_directories(mayo_5_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_5_avx2)
+ add_library(mayo_5_avx2 OBJECT pqmayo_mayo-5_avx2/api.c pqmayo_mayo-5_avx2/arithmetic.c pqmayo_mayo-5_avx2/mayo.c pqmayo_mayo-5_avx2/params.c)
+ target_include_directories(mayo_5_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_avx2)
+ target_include_directories(mayo_5_avx2 PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+ target_compile_options(mayo_5_avx2 PRIVATE -mavx2)
+ target_compile_options(mayo_5_avx2 PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+ set(_MAYO_OBJS ${_MAYO_OBJS} $<TARGET_OBJECTS:mayo_5_avx2>)
+endif()
+
+set(MAYO_OBJS ${_MAYO_OBJS} PARENT_SCOPE)
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE
@@ -0,0 +1,202 @@
+ Apache License
+ Version 2.0, January 2004
+ http://www.apache.org/licenses/
+
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+ 1. Definitions.
+
+ "License" shall mean the terms and conditions for use, reproduction,
+ and distribution as defined by Sections 1 through 9 of this document.
+
+ "Licensor" shall mean the copyright owner or entity authorized by
+ the copyright owner that is granting the License.
+
+ "Legal Entity" shall mean the union of the acting entity and all
+ other entities that control, are controlled by, or are under common
+ control with that entity. For the purposes of this definition,
+ "control" means (i) the power, direct or indirect, to cause the
+ direction or management of such entity, whether by contract or
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
+ outstanding shares, or (iii) beneficial ownership of such entity.
+
+ "You" (or "Your") shall mean an individual or Legal Entity
+ exercising permissions granted by this License.
+
+ "Source" form shall mean the preferred form for making modifications,
+ including but not limited to software source code, documentation
+ source, and configuration files.
+
+ "Object" form shall mean any form resulting from mechanical
+ transformation or translation of a Source form, including but
+ not limited to compiled object code, generated documentation,
+ and conversions to other media types.
+
+ "Work" shall mean the work of authorship, whether in Source or
+ Object form, made available under the License, as indicated by a
+ copyright notice that is included in or attached to the work
+ (an example is provided in the Appendix below).
+
+ "Derivative Works" shall mean any work, whether in Source or Object
+ form, that is based on (or derived from) the Work and for which the
+ editorial revisions, annotations, elaborations, or other modifications
+ represent, as a whole, an original work of authorship. For the purposes
+ of this License, Derivative Works shall not include works that remain
+ separable from, or merely link (or bind by name) to the interfaces of,
+ the Work and Derivative Works thereof.
+
+ "Contribution" shall mean any work of authorship, including
+ the original version of the Work and any modifications or additions
+ to that Work or Derivative Works thereof, that is intentionally
+ submitted to Licensor for inclusion in the Work by the copyright owner
+ or by an individual or Legal Entity authorized to submit on behalf of
+ the copyright owner. For the purposes of this definition, "submitted"
+ means any form of electronic, verbal, or written communication sent
+ to the Licensor or its representatives, including but not limited to
+ communication on electronic mailing lists, source code control systems,
+ and issue tracking systems that are managed by, or on behalf of, the
+ Licensor for the purpose of discussing and improving the Work, but
+ excluding communication that is conspicuously marked or otherwise
+ designated in writing by the copyright owner as "Not a Contribution."
+
+ "Contributor" shall mean Licensor and any individual or Legal Entity
+ on behalf of whom a Contribution has been received by Licensor and
+ subsequently incorporated within the Work.
+
+ 2. Grant of Copyright License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ copyright license to reproduce, prepare Derivative Works of,
+ publicly display, publicly perform, sublicense, and distribute the
+ Work and such Derivative Works in Source or Object form.
+
+ 3. Grant of Patent License. Subject to the terms and conditions of
+ this License, each Contributor hereby grants to You a perpetual,
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+ (except as stated in this section) patent license to make, have made,
+ use, offer to sell, sell, import, and otherwise transfer the Work,
+ where such license applies only to those patent claims licensable
+ by such Contributor that are necessarily infringed by their
+ Contribution(s) alone or by combination of their Contribution(s)
+ with the Work to which such Contribution(s) was submitted. If You
+ institute patent litigation against any entity (including a
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
+ or a Contribution incorporated within the Work constitutes direct
+ or contributory patent infringement, then any patent licenses
+ granted to You under this License for that Work shall terminate
+ as of the date such litigation is filed.
+
+ 4. Redistribution. You may reproduce and distribute copies of the
+ Work or Derivative Works thereof in any medium, with or without
+ modifications, and in Source or Object form, provided that You
+ meet the following conditions:
+
+ (a) You must give any other recipients of the Work or
+ Derivative Works a copy of this License; and
+
+ (b) You must cause any modified files to carry prominent notices
+ stating that You changed the files; and
+
+ (c) You must retain, in the Source form of any Derivative Works
+ that You distribute, all copyright, patent, trademark, and
+ attribution notices from the Source form of the Work,
+ excluding those notices that do not pertain to any part of
+ the Derivative Works; and
+
+ (d) If the Work includes a "NOTICE" text file as part of its
+ distribution, then any Derivative Works that You distribute must
+ include a readable copy of the attribution notices contained
+ within such NOTICE file, excluding those notices that do not
+ pertain to any part of the Derivative Works, in at least one
+ of the following places: within a NOTICE text file distributed
+ as part of the Derivative Works; within the Source form or
+ documentation, if provided along with the Derivative Works; or,
+ within a display generated by the Derivative Works, if and
+ wherever such third-party notices normally appear. The contents
+ of the NOTICE file are for informational purposes only and
+ do not modify the License. You may add Your own attribution
+ notices within Derivative Works that You distribute, alongside
+ or as an addendum to the NOTICE text from the Work, provided
+ that such additional attribution notices cannot be construed
+ as modifying the License.
+
+ You may add Your own copyright statement to Your modifications and
+ may provide additional or different license terms and conditions
+ for use, reproduction, or distribution of Your modifications, or
+ for any such Derivative Works as a whole, provided Your use,
+ reproduction, and distribution of the Work otherwise complies with
+ the conditions stated in this License.
+
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
+ any Contribution intentionally submitted for inclusion in the Work
+ by You to the Licensor shall be under the terms and conditions of
+ this License, without any additional terms or conditions.
+ Notwithstanding the above, nothing herein shall supersede or modify
+ the terms of any separate license agreement you may have executed
+ with Licensor regarding such Contributions.
+
+ 6. Trademarks. This License does not grant permission to use the trade
+ names, trademarks, service marks, or product names of the Licensor,
+ except as required for reasonable and customary use in describing the
+ origin of the Work and reproducing the content of the NOTICE file.
+
+ 7. Disclaimer of Warranty. Unless required by applicable law or
+ agreed to in writing, Licensor provides the Work (and each
+ Contributor provides its Contributions) on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+ implied, including, without limitation, any warranties or conditions
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+ PARTICULAR PURPOSE. You are solely responsible for determining the
+ appropriateness of using or redistributing the Work and assume any
+ risks associated with Your exercise of permissions under this License.
+
+ 8. Limitation of Liability. In no event and under no legal theory,
+ whether in tort (including negligence), contract, or otherwise,
+ unless required by applicable law (such as deliberate and grossly
+ negligent acts) or agreed to in writing, shall any Contributor be
+ liable to You for damages, including any direct, indirect, special,
+ incidental, or consequential damages of any character arising as a
+ result of this License or out of the use or inability to use the
+ Work (including but not limited to damages for loss of goodwill,
+ work stoppage, computer failure or malfunction, or any and all
+ other commercial damages or losses), even if such Contributor
+ has been advised of the possibility of such damages.
+
+ 9. Accepting Warranty or Additional Liability. While redistributing
+ the Work or Derivative Works thereof, You may choose to offer,
+ and charge a fee for, acceptance of support, warranty, indemnity,
+ or other liability obligations and/or rights consistent with this
+ License. However, in accepting such obligations, You may act only
+ on Your own behalf and on Your sole responsibility, not on behalf
+ of any other Contributor, and only if You agree to indemnify,
+ defend, and hold each Contributor harmless for any liability
+ incurred by, or claims asserted against, such Contributor by reason
+ of your accepting any such warranty or additional liability.
+
+ END OF TERMS AND CONDITIONS
+
+ APPENDIX: How to apply the Apache License to your work.
+
+ To apply the Apache License to your work, attach the following
+ boilerplate notice, with the fields enclosed by brackets "{}"
+ replaced with your own identifying information. (Don't include
+ the brackets!) The text should be enclosed in the appropriate
+ comment syntax for the file format. We also recommend that a
+ file or class name and description of purpose be included on the
+ same "printed page" as the copyright notice for easier
+ identification within third-party archives.
+
+ Copyright {yyyy} {name of copyright owner}
+
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+ http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include
+#include
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
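+// Portable fallback when AES-NI is not enabled: produce outputByteLen bytes of AES-128-CTR keystream with an all-zero nonce, keyed by the 16-byte input, via the liboqs shim.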
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen) {
+ (void) inputByteLen;
+ uint8_t iv[12] = { 0 };
+ aes128ctr_prf(output, outputByteLen, input, iv);
+ return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.c b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c
new file mode 100644
index 0000000000..b7e2ef80ce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_1
+#else
+#define MAYO_PARAMS 0
+#endif
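+// With ENABLE_PARAMS_DYNAMIC the mayo_* calls receive the MAYO_1 parameter set at run time; otherwise parameters are fixed at compile time and 0 is passed as a placeholder.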
+
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+ return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk) {
+ return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk) {
+ return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk) {
+ return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk) {
+ if (siglen != CRYPTO_BYTES)
+ return -1;
+ return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.h b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h
new file mode 100644
index 0000000000..86b7bd545d
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 1168
+#define CRYPTO_BYTES 321
+
+#define CRYPTO_ALGNAME "MAYO-1"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+ mayo12_m_upper(m_legs, in, out, size);
+#else
+ int m_vecs_stored = 0;
+ for (int r = 0; r < size; r++) {
+ for (int c = r; c < size; c++) {
+ m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored );
+ if (r != c) {
+ m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored );
+ }
+ m_vecs_stored ++;
+ }
+ }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ (void) p;
+ mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ (void) p;
+ mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ (void) p;
+ mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+ #ifndef MAYO_VARIANT
+ const int m_legs = PARAM_m(p) / 32;
+ #else
+ (void) p;
+ #endif
+ const int param_o = PARAM_o(p);
+ const int param_v = PARAM_n(p) - PARAM_o(p);
+
+ int bs_mat_entries_used = 0;
+ for (int r = 0; r < param_v; r++) {
+ for (int c = r; c < param_v; c++) {
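+ // the diagonal blocks of P1 + P1^T vanish in characteristic 2, so they are skipped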
+ if(c==r) {
+ bs_mat_entries_used += 1;
+ continue;
+ }
+ for (int k = 0; k < param_o; k += 1) {
+
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+ vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k));
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k));
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+ param_k, param_n - param_o, param_o);
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+ mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+ Y, param_k, param_n - param_o,
+ param_k);
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int m_legs = PARAM_m(p) / 32;
+ mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1);
+ mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2]
+// [ 0 P3 ] [S2] [ P3*S2]
+// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+// [ P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS) {
+ (void) m;
+#if MAYO_AVX
+ const int n = o + v;
+
+ /* Old approach which is constant time but doesn't have to be */
+ unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+ unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+
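+ // split S row-wise into S1 (first v = n-o columns) and S2 (last o columns)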
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+ mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+ mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+ NOT IMPLEMENTED
+#endif
+#else
+ const int n = o + v;
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+ mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+ mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+ const unsigned char *y, const unsigned char *r,
+ unsigned char *x, int k, int o, int m, int A_cols) {
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ unsigned char finished;
+ int col_upper_bound;
+ unsigned char correct_column;
+
+ // x <- r
+ for (int i = 0; i < k * o; i++) {
+ x[i] = r[i];
+ }
+
+ // compute Ar;
+ unsigned char Ar[M_MAX];
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+ }
+ mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+ // move y - Ar to last column of matrix A
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+ }
+
+ EF(A, m, k * o + 1);
+
+ // check if last row of A (excluding the last entry of y) is zero
+ unsigned char full_rank = 0;
+ for (int i = 0; i < A_cols - 1; i++) {
+ full_rank |= A[(m - 1) * A_cols + i];
+ }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+ if (full_rank == 0) {
+ return 0;
+ }
+
+ // back substitution in constant time
+ // the index of the first nonzero entry in each row is secret, which makes
+ // things less efficient
+
+ for (int row = m - 1; row >= 0; row--) {
+ finished = 0;
+ col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+ for (int col = row; col <= col_upper_bound; col++) {
+
+ // Compare two chars in constant time.
+ // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+ correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+ unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+ x[col] ^= u;
+
+ for (int i = 0; i < row; i += 8) {
+ uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8)
+ ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+ ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+ ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+ tmp = mul_fx8(u, tmp);
+
+ A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf;
+ A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+ A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+ A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+ A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+ A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+ A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+ A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+ }
+
+ finished = finished | correct_column;
+ }
+ }
+ return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include
+#include
+#include
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+ #include
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+ out[6] = in[6];
+ out[7] = in[7];
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+ acc[6] ^= in[6];
+ acc[7] ^= in[7];
+}
+
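+// Multiply-accumulate by x (0x2) and by x^-1 (0x9) in GF(16) = F_2[x]/(x^4+x+1); note x * 0x9 = x^4 + x = 1.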
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+inline
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
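+// Fold the 16 bins (bins[i] accumulates vectors still to be scaled by field element i) into bin 1 using only multiplications by x and x^-1; the result is then copied out of bin 1.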
+static
+ inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
+ m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+ m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8);
+ m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8);
+ m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8);
+ m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8);
+ m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+ m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8);
+ m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+ m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8);
+ m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8);
+ m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8);
+ m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8);
+ m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8);
+ vec_copy_128(bins + 8, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+}
+
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+inline
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+ m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+ m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+ m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+ m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+ m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+ m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+ m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+ m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+ m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+ m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+ m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+ m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+ m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+ vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+}
+
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+ m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+ m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+ m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+ m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+ m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+ m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+ m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+ m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+ m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+ m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+ m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+ m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+ m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+ vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+#include
+
+#define K_OVER_2 ((K_MAX+1)/2)
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+ 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+ 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+//
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+
+ __m256i bx = _mm256_set1_epi16( b & 0xf );
+ __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+ const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+ const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+ const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+ const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+ __m256i mask_1 = _mm256_set1_epi16(1);
+ __m256i mask_4 = _mm256_set1_epi16(4);
+ __m256i mask_0 = _mm256_setzero_si256();
+
+ return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+ ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+ ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+ ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
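+// Apply a GF(16) lookup table to the low nibble (tab_l) and high nibble (tab_h) of every byte via two vpshufb lookups.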
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) {
+ return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f);
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+ __m256i multab_l = tbl32_gf16_multab2( b );
+ __m256i multab_h = _mm256_slli_epi16( multab_l, 4 );
+
+ return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) );
+}
+
+static
+inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+ // build multiplication tables
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ for (size_t c = 0; c < O_MAX; c+=2)
+ {
+ O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4);
+ }
+ }
+}
+
+
+static
+inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+ // build multiplication tables
+ size_t r;
+ for (size_t c = 0; c < V_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4);
+ }
+#if K_MAX % 2 == 1
+ V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]);
+#endif
+ }
+}
+
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+ 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+ 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+ 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+ 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+ 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+ 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+ 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+ 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+ 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+ 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+ 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+ 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+ 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+ 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+ 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+ 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+ 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+ 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+ 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+ 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+ 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+ 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+ 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+ 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+ 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+ 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+ 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a,
+ 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+static
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+ size_t r;
+ for (size_t c = 0; c < V_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]))
+ ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4);
+ }
+#if K_MAX % 2 == 1
+ S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]));
+#endif
+ }
+}
+
+static
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+ // build multiplication tables
+ size_t r;
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]))
+ ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4);
+ }
+#if K_MAX % 2 == 1
+ S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ;
+#endif
+ }
+}
+
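+// Multiply the 16 GF(16) nibbles packed in a by the scalar b: shift-and-add over the bits of b, reducing by x^4 + x + 1 (an overflowing x^3 term folds back as x + 1, hence the "* 3").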
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ uint64_t a_msb;
+ uint64_t a64 = a;
+ uint64_t b32 = b;
+ uint64_t r64 = a64 * (b32 & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 1) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 2) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 3) & 1);
+
+ return r64;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+ __m256i bx = _mm256_set1_epi16( b & 0xf );
+ __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+ const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+ const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+ const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+ const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+ __m256i mask_1 = _mm256_set1_epi16(1);
+ __m256i mask_4 = _mm256_set1_epi16(4);
+ __m256i mask_0 = _mm256_setzero_si256();
+
+ return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+ ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+ ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+ ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+/* put matrix in row echelon form with ones on first nonzero entries in constant time*/
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+
+ (void) _nrows;
+ (void) _ncols;
+
+ #define nrows M_MAX
+ #define ncols (K_MAX * O_MAX + 1)
+
+ #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+ #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+ __m256i _pivot_row[AVX_REGS_PER_ROW];
+ __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+ unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+ unsigned char* A_bytes = (unsigned char*) A_avx;
+
+ // load A in the tail of AVX2 registers
+ for (int i = 0; i < nrows; i++) {
+ for (int j = 0; j < ncols; j++)
+ {
+ A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+ }
+ }
+
+ // pivot row is secret, pivot col is not
+ unsigned char inverse;
+ int pivot_row = 0;
+ int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+ for (; pivot_col < MAX_COLS-128; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-96; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-64; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-32; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+
+ // write the matrix A back
+ for (int i = 0; i < nrows; i++) {
+ for (int j = 0; j < ncols; j++) {
+ A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+ }
+ }
+ mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+ mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+int pivot_col_rounded = pivot_col/32;
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+ _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1;
+for (int row = pivot_row_lower_bound;
+ row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+ uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row);
+ uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+ __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) );
+ for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+ }
+ pivot = pivot_row_bytes[pivot_col];
+ pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]);
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+ unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row));
+ unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+ __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim);
+ if (row <= pivot_row_upper_bound) {
+ __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero);
+ for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) {
+ A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+ _mm256_shuffle_epi8(multab, _pivot_row[col]);
+ }
+ } else {
+ for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+ }
+ }
+}
+
+pivot_row += (-(int32_t)(~pivot_is_zero));
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
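+// decode unpacks a byte string into one GF(16) element per byte (low nibble first);
+// encode packs it back into two nibbles per byte.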
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+ int i;
+ for (i = 0; i < mdeclen / 2; ++i) {
+ *mdec++ = m[i] & 0xf;
+ *mdec++ = m[i] >> 4;
+ }
+
+ if (mdeclen % 2 == 1) {
+ *mdec++ = m[i] & 0x0f;
+ }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+ int i;
+ for (i = 0; i < mlen / 2; ++i, m += 2) {
+ menc[i] = (*m) | (*(m + 1) << 4);
+ }
+
+ if (mlen % 2 == 1) {
+ menc[i] = (*m);
+ }
+}
+
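+// compute_rhs folds the m-coefficient vPv blocks into one vector by repeatedly
+// multiplying the accumulator by z and reducing modulo f(z) (via the tail
+// coefficients PARAM_f_tail), then adds the target t to obtain the right-hand side y.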
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *vPv = _vPv;
+ uint64_t temp[M_MAX/16] = {0};
+ unsigned char *temp_bytes = (unsigned char *) temp;
+ int k = 0;
+ for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+ for (int j = i; j < PARAM_k(p); j++) {
+ // multiply by X (shift up 4 bits)
+ unsigned char top = temp[k] >> 60;
+ temp[k] <<= 4;
+ k--;
+ for(; k>=0; k--){
+ temp[k+1] ^= temp[k] >> 60;
+ temp[k] <<= 4;
+ }
+ // reduce mod f(X)
+ for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+ if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+ }
+ else {
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+ }
+ }
+
+ // extract from vPv and add
+ for(k=0; k < PARAM_m(p)/16; k ++){
+ temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+ }
+ k--;
+ }
+ }
+
+ // add to y
+ for (int i = 0; i < PARAM_m(p); i+=2)
+ {
+#ifdef TARGET_BIG_ENDIAN
+ y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+ y[i] = t[i] ^ (temp_bytes[i/2] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+ }
+}
+
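+// transpose a 16x16 matrix of nibbles, stored row-wise in 16 uint64_t words, using a
+// log-depth mask-and-swap network (nibble, byte, 2-byte, then 4-byte swaps).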
+static void transpose_16x16_nibbles(uint64_t *M){
+ static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+ static const uint64_t even_bytes = 0x00ff00ff00ff00ff;
+ static const uint64_t even_2bytes = 0x0000ffff0000ffff;
+ static const uint64_t even_half = 0x00000000ffffffff;
+
+ for (size_t i = 0; i < 16; i+=2)
+ {
+ uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles;
+ M[i ] ^= t << 4;
+ M[i+1] ^= t;
+ }
+
+ for (size_t i = 0; i < 16; i+=4)
+ {
+ uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes;
+ uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes;
+ M[i ] ^= (t0 << 8);
+ M[i+1] ^= (t1 << 8);
+ M[i+2] ^= t0;
+ M[i+3] ^= t1;
+ }
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes;
+ uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+ M[i ] ^= t0 << 16;
+ M[i+ 8] ^= t1 << 16;
+ M[i+ 4] ^= t0;
+ M[i+12] ^= t1;
+ }
+
+ for (size_t i = 0; i < 8; i++)
+ {
+ uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half;
+ M[i ] ^= t << 32;
+ M[i+8] ^= t;
+ }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
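+// compute_A assembles the coefficient matrix of the linear system: the M_i/M_j blocks
+// from VtL are accumulated with a 4-bit shift per (i,j) pair, transposed to row order,
+// the rows beyond m are reduced modulo f(z), and the result is decoded into A_out.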
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *VtL = _VtL;
+ int bits_to_shift = 0;
+ int words_to_shift = 0;
+ uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+ size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+ const uint64_t *Mi, *Mj;
+
+ for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+ for (int j = PARAM_k(p) - 1; j >= i; --j) {
+ // add the M_i and M_j to A, shifted "down" by l positions
+ Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+
+ if (i != j) {
+ Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+ }
+
+ bits_to_shift += 4;
+ if(bits_to_shift == 64){
+ words_to_shift ++;
+ bits_to_shift = 0;
+ }
+ }
+ }
+
+ for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+ {
+ transpose_16x16_nibbles(A + c);
+ }
+
+ unsigned char tab[F_TAIL_LEN*4] = {0};
+ for (size_t i = 0; i < F_TAIL_LEN; i++)
+ {
+ tab[4*i] = mul_f(PARAM_f_tail(p)[i],1);
+ tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+ tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+ tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+ }
+
+ uint64_t low_bit_in_nibble = 0x1111111111111111;
+
+ for (size_t c = 0; c < A_width; c+= 16)
+ {
+ for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+ {
+ size_t pos = (r/16)*A_width + c + (r%16);
+ uint64_t t0 = A[pos] & low_bit_in_nibble;
+ uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+ uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+ uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+ for (size_t t = 0; t < F_TAIL_LEN; t++)
+ {
+ A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+ }
+ }
+ }
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i)
+ A[i] = BSWAP64(A[i]);
+#endif
+
+ for (int r = 0; r < PARAM_m(p); r+=16)
+ {
+ for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+ {
+ for (size_t i = 0; i < 16; i++)
+ {
+ decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+ }
+ }
+ }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+ int ret = 0;
+
+ ret = mayo_keypair_compact(p, pk, sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+err:
+ return ret;
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+ unsigned char y[M_MAX]; // secret data
+ unsigned char salt[SALT_BYTES_MAX]; // not secret data
+ unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+ Vdec[N_MINUS_O_MAX * K_MAX]; // secret data
+ unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data
+ unsigned char x[K_MAX * N_MAX]; // not secret data
+ unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data
+ unsigned char s[K_MAX * N_MAX]; // not secret data
+ const unsigned char *seed_sk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+ alignas(32) sk_t sk; // secret data
+ unsigned char Ox[N_MINUS_O_MAX]; // secret data
+ // unsigned char Mdigest[DIGEST_BYTES];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+ unsigned char *ctrbyte;
+ unsigned char *vi;
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_v_bytes = PARAM_v_bytes(p);
+ const int param_r_bytes = PARAM_r_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_A_cols = PARAM_A_cols(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ ret = mayo_expand_sk(p, csk, &sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+ seed_sk = csk;
+ decode(sk.o, O, (param_n - param_o) * param_o);
+
+ // hash message
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ uint64_t *P1 = sk.p;
+ uint64_t *L = P1 + (param_P1_bytes/8);
+ alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ L[i] = BSWAP64(L[i]);
+ }
+#endif
+
+ // choose the randomizer
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(tmp + param_digest_bytes, param_salt_bytes);
+ #else
+ if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // hashing to salt
+ memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+ param_sk_seed_bytes);
+ shake256(salt, param_salt_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+ // hashing to t
+ memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+ ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+ shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+ decode(tenc, t, param_m); // may not be necessary
+
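+ // retry with a fresh V (derived from the counter byte ctr) until sample_solution
+ // finds a solution x to the linear system with matrix A and right-hand side y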
+ for (int ctr = 0; ctr <= 255; ++ctr) {
+ *ctrbyte = (unsigned char)ctr;
+
+ shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+ // decode the v_i vectors
+ for (int i = 0; i <= param_k - 1; ++i) {
+ decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+ param_n - param_o);
+ }
+
+ // compute all the V * L matrices.
+ // compute all the V * P1 * V^T matrices.
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+ V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+ compute_rhs(p, Y, t, y);
+ compute_A(p, Mtmp, A);
+
+ decode(V + param_k * param_v_bytes, r,
+ param_k *
+ param_o);
+ if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+ break;
+ } else {
+ memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+ }
+ }
+
+ // s is already 0
+ // TODO: optimize this?
+ for (int i = 0; i <= param_k - 1; ++i) {
+ vi = Vdec + i * (param_n - param_o);
+ mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+ mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+ memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+ }
+ encode(s, sig, param_n * param_k);
+ memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+ *siglen = param_sig_bytes;
+err:
+ mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+ mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+ mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+ mayo_secure_clear(r, K_MAX * O_MAX + 1);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(&sk, sizeof(sk_t));
+ mayo_secure_clear(Ox, N_MINUS_O_MAX);
+ mayo_secure_clear(tmp,
+ DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+ return ret;
+}
+
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ size_t siglen = param_sig_bytes;
+ ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk);
+ if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes)
+ goto err;
+
+ memmove(sm + param_sig_bytes, m, mlen);
+ *smlen = siglen + mlen;
+err:
+ return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk) {
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ if (smlen < (size_t)param_sig_bytes) {
+ return MAYO_ERR;
+ }
+ int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm,
+ pk);
+
+ if (result == MAYO_OK) {
+ *mlen = smlen - param_sig_bytes;
+ memmove(m, sm + param_sig_bytes, *mlen);
+ }
+
+ return result;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char *seed_sk = csk;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+ alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+ unsigned char *seed_pk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ // seed_sk ←$ B^(sk_seed_bytes)
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(seed_sk, param_sk_seed_bytes);
+ #else
+ if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // S ← shake256(seed_sk, pk_seed_bytes + O_bytes)
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ // seed_pk ← s[0 : pk_seed_bytes]
+ seed_pk = S;
+
+ // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+ decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode/decode not necessary, since P1, P2, and P3 are sampled and stored in the correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+
+ int m_legs = param_m / 32;
+
+ uint64_t *P1 = P;
+ uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+ // compute P3 = O^t * (P1*O + P2)
+ Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+ // store seed_pk in cpk
+ memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+ alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+ // compute Upper(P3) and store in cpk
+ m_upper(m_legs, P3, P3_upper, param_o);
+
+ memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+ return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *pk) {
+ #ifdef MAYO_VARIANT
+ (void)p;
+ #endif
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes);
+ pk += param_P1_bytes + param_P2_bytes;
+ memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes);
+ return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *sk) {
+ int ret = MAYO_OK;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ uint64_t *P = sk->p;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_o = PARAM_o(p);
+ const int param_v = PARAM_v(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ const unsigned char *seed_sk = csk;
+ unsigned char *seed_pk = S;
+
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ decode(S + param_pk_seed_bytes, O,
+ param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode/decode not necessary, since P1, P2, and P3 are sampled and stored in the correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+ uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ uint64_t *P1 = P;
+ // compute L_i = (P1 + P1^t)*O + P2
+ uint64_t *L = P2;
+ P1P1t_times_O(p, P1, O, L);
+
+ // write to sk
+ memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *cpk) {
+ unsigned char tEnc[M_BYTES_MAX];
+ unsigned char t[M_MAX];
+ unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+ unsigned char s[K_MAX * N_MAX];
+ alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+ if (ret != MAYO_OK) {
+ return MAYO_ERR;
+ }
+
+ uint64_t *P1 = pk;
+ uint64_t *P2 = pk + (param_P1_bytes / 8);
+ uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ P2[i] = BSWAP64(P2[i]);
+ }
+ for (int i = 0; i < param_P3_bytes / 8; ++i) {
+ P3[i] = BSWAP64(P3[i]);
+ }
+#endif
+
+ // hash m
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ // compute t
+ memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+ param_salt_bytes);
+ shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+ decode(tEnc, t, param_m);
+
+ // decode s
+ decode(sig, s, param_k * param_n);
+
+ // Compute S*P*S^T
+ alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+ m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+ param_v, param_o, param_k, SPS);
+
+ // combine the vectors in SPS and reduce mod f(X)
+ compute_rhs(p, SPS, y, y);
+
+ if (memcmp(y, t, param_m) == 0) {
+ return MAYO_OK; // good signature
+ }
+ return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64 \
+ { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3
+#define F_TAIL_96 \
+ { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x
+#define F_TAIL_128 \
+ { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2
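+// Each F_TAIL_* entry is the GF(16) coefficient (nibble-encoded: x = 2, x^2 = 4,
+// x^3 = 8) of z^0 .. z^4 in the tail of f(z); the leading term z^m is implicit.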
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX SALT_BYTES_MAX
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+ int m;
+ int n;
+ int o;
+ int k;
+ int q;
+ const unsigned char *f_tail;
+ int m_bytes;
+ int O_bytes;
+ int v_bytes;
+ int r_bytes;
+ int R_bytes;
+ int P1_bytes;
+ int P2_bytes;
+ int P3_bytes;
+ int csk_bytes;
+ int esk_bytes;
+ int cpk_bytes;
+ int epk_bytes;
+ int sig_bytes;
+ int salt_bytes;
+ int sk_seed_bytes;
+ int digest_bytes;
+ int pk_seed_bytes;
+ const char *name;
+} mayo_params_t;
+
+typedef struct sk_t {
+ uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+ uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
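+/**
+ * MAYO signature generation (signature only).
+ *
+ * Signs a message with a compacted secret key and writes the detached signature.
+ * The caller is responsible for allocating sufficient memory to hold sig.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sig Signature
+ * @param[out] siglen Pointer to the length of sig
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] csk Compacted secret key
+ * @return int status code
+ */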
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(): returns 0 if the signature is valid, and 1 otherwise.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is verified
+ * @param[in] mlen Message length
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stdint.h>
+#include <stddef.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
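+// Constant-time comparison helpers: each returns an all-ones or all-zeros mask so
+// callers can select values with bitwise operations instead of branches.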
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+ int32_t diff = b - a;
+ return (uint32_t) (diff >> (8*sizeof(uint32_t)-1));
+}
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+ int64_t diff = ((int64_t) b) - ((int64_t) a);
+ return (uint64_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF
+static inline uint32_t ct_compare_32(int a, int b) {
+ return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF
+static inline uint64_t ct_compare_64(int a, int b) {
+ return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00, else 0xFF
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+ return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+ OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+ OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/params.c b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) \
+ const mayo_params_t nm = { \
+ .m = PARAM_JOIN2(nm, m), \
+ .n = PARAM_JOIN2(nm, n), \
+ .o = PARAM_JOIN2(nm, o), \
+ .k = PARAM_JOIN2(nm, k), \
+ .q = PARAM_JOIN2(nm, q), \
+ .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+ .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+ .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+ .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+ .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+ .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+ .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+ .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+ .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+ .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+ .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+ .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+ .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+ .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+ .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+ .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+ .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+ .name = #nm \
+ };
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include
+#include
+#include
+#include
+
+
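+// The routines below keep the low and high nibbles of each input register in separate
+// accumulators while summing vpshufb-based GF(16) multiplications, and merge them back
+// into packed-nibble form only when adding into the accumulator in memory.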
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ cols_used ++;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+ acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+ acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+ }
+ }
+}
+
+static
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+ const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+ acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+ }
+ }
+}
+
+
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ cols_used += 1;
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ pos += (V_MAX -c - 1);
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k ));
+ __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+ __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+ __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+ }
+ }
+}
+
+
+static
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *L = (__m256i *) _L;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+static
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *Pv = (__m256i *) _Pv;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+static
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+
+ // P1 * S1
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ P1_cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // P2 * S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P3 = (__m256i *) _P3;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+ const __m256i *PS2 = (__m256i *) _PS2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <mayo.h>
+#include <immintrin.h>
+
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+ }
+}
+
+
+static
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+ }
+ }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ cols_used += 1;
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+ pos += (V_MAX -c - 1);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k ));
+ __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4));
+ _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+ }
+ }
+}
+
+
+static
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *L = (__m256i *) _L;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *Pv = (__m256i *) _Pv;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+static
+inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
+// same as mayo_12_P1_times_Vt_avx2
+static
+inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+ const __m256i *PS2 = (__m256i *) _PS2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+static
+inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P2 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), P2: v x o, S1: v x k, S2: o x k
+static
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+
+ // P1 * S1
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ P1_cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // P2 * S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P3 = (__m256i *) _P3;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P3 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+ (void) size;
+ int m_vecs_stored = 0;
+
+ for (int r = 0; r < O_MAX; ++r) {
+ const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+ __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+ _out[0] = _in[0];
+ m_vecs_stored++;
+ for (int c = r + 1; c < O_MAX; ++c) {
+ const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+ const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+ _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+ _out[0] = _in2[0] ^ _in3[0];
+ m_vecs_stored++;
+ }
+ }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <mayo.h>
+#include <immintrin.h>
+
+
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ cols_used ++;
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ pos += (V_MAX -c - 1);
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), P2: v x o, S1: v x k, S2: o x k
+static
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ // P1 times S1
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ P1_cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // P2 times S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+ mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc);
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+ // carryless multiply
+ unsigned char p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ unsigned char top_p = p & 0xf0;
+ unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f;
+ return out;
+}
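+// Example: mul_f(0x6, 0x7) == 0x1, since (x^2 + x)*(x^2 + x + 1) = x^4 + x, which reduces to 1
+// mod x^4 + x + 1; 0x6 and 0x7 are therefore multiplicative inverses in GF(16).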
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+ // carryless multiply
+ uint64_t p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+ uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f;
+ return out;
+}
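+// mul_fx8 is mul_f applied to eight field elements at once: b holds one GF(16) element in the
+// low nibble of each byte, and the 0xf0... mask collects the per-byte overflow before reduction.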
+
+// GF(16) addition
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) subtraction
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) negation
+static inline unsigned char neg_f(unsigned char a) {
+ return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+ // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+ // 10, 4, 3, 8}; return table[a & 15];
+
+ unsigned char a2 = mul_f(a, a);
+ unsigned char a4 = mul_f(a2, a2);
+ unsigned char a8 = mul_f(a4, a4);
+ unsigned char a6 = mul_f(a2, a4);
+ unsigned char a14 = mul_f(a8, a6);
+
+ return a14;
+}
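+// a^14 is the inverse because the multiplicative group of GF(16) has order 15 (a^15 = 1 for
+// a != 0); inverse_f(0) returns 0, matching the commented-out lookup table.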
+
+static inline unsigned char lincomb(const unsigned char *a,
+ const unsigned char *b, int n, int m) {
+ unsigned char ret = 0;
+ for (int i = 0; i < n; ++i, b += m) {
+ ret = add_f(mul_f(a[i], *b), ret);
+ }
+ return ret;
+}
+
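+// mat_mul: c = a * b over GF(16), with a of size row_a x colrow_ab and b of size
+// colrow_ab x col_b, both row-major with one element per byte (lincomb walks a column
+// of b with stride col_b).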
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int colrow_ab, int row_a, int col_b) {
+ for (int i = 0; i < row_a; ++i, a += colrow_ab) {
+ for (int j = 0; j < col_b; ++j, ++c) {
+ *c = lincomb(a, b + j, colrow_ab, col_b);
+ }
+ }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int m, int n) {
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < n; ++j) {
+ *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j));
+ }
+ }
+}
+
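+// "m-vectors" pack m GF(16) elements two per byte; with m_legs = m/32, one vector occupies
+// m_legs * 2 uint64_t words.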
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+ uint64_t *out) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ out[i] = in[i];
+ }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+ uint64_t *acc) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ acc[i] ^= in[i];
+ }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
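+// gf16v_mul_u32 multiplies eight packed nibbles by the scalar b using shift-and-add:
+// the 0x88888888 mask isolates the top bit of every nibble so the left shift cannot carry
+// across nibbles, and ((a_msb >> 3) * 3) folds x^4 back to x + 1 within each nibble.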
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+ uint32_t a_msb;
+ uint32_t a32 = a;
+ uint32_t b32 = b;
+ uint32_t r32 = a32 * (b32 & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 1) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 2) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 3) & 1);
+
+ return r32;
+
+}
+
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < m_legs*2;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i
+#include
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen) {
+ (void) inputByteLen;
+ uint8_t iv[12] = { 0 };
+ aes128ctr_prf(output, outputByteLen, input, iv);
+ return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.c b/src/sig/mayo/pqmayo_mayo-1_opt/api.c
new file mode 100644
index 0000000000..b7e2ef80ce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_1
+#else
+#define MAYO_PARAMS 0
+#endif
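+// With ENABLE_PARAMS_DYNAMIC the MAYO-1 parameter set is passed to the mayo_* calls at run
+// time; otherwise the parameters are fixed at compile time and the pointer argument is unused.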
+
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+ return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk) {
+ return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk) {
+ return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk) {
+ return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk) {
+ if (siglen != CRYPTO_BYTES)
+ return -1;
+ return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.h b/src/sig/mayo/pqmayo_mayo-1_opt/api.h
new file mode 100644
index 0000000000..86b7bd545d
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <stddef.h>
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 1168
+#define CRYPTO_BYTES 321
+
+#define CRYPTO_ALGNAME "MAYO-1"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <stdint.h>
+#include <stdalign.h>
+#include <mayo.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <mem.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
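+// m_upper: keep only the upper-triangular part of a size x size matrix of m-vectors,
+// out[r][c] = in[r][c] + in[c][r] for c > r and out[r][r] = in[r][r], stored packed row by row.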
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+ mayo12_m_upper(m_legs, in, out, size);
+#else
+ int m_vecs_stored = 0;
+ for (int r = 0; r < size; r++) {
+ for (int c = r; c < size; c++) {
+ m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored );
+ if (r != c) {
+ m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored );
+ }
+ m_vecs_stored ++;
+ }
+ }
+#endif
+}
+
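+// acc += (P1 + P1^t) * O; since P1 is upper triangular and the field has characteristic 2,
+// the diagonal of P1 + P1^t vanishes, which is why the c == r entries are skipped.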
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ (void) p;
+ mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ (void) p;
+ mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ (void) p;
+ mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+ #ifndef MAYO_VARIANT
+ const int m_legs = PARAM_m(p) / 32;
+ #else
+ (void) p;
+ #endif
+ const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    int bs_mat_entries_used = 0;
+ for (int r = 0; r < param_v; r++) {
+ for (int c = r; c < param_v; c++) {
+ if(c==r) {
+ bs_mat_entries_used += 1;
+ continue;
+ }
+ for (int k = 0; k < param_o; k += 1) {
+
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+ vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k));
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k));
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+#endif
+}
+
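+// Computes M += V * L and Y += V * P1 * V^t (via the intermediate Pv = P1 * V^t;
+// P1 is upper triangular).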
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+ param_k, param_n - param_o, param_o);
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+ mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+ Y, param_k, param_n - param_o,
+ param_k);
+#endif
+}
+
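+// P1O_P2 += P1 * O (the caller typically seeds P1O_P2 with P2, so it ends up holding P1*O + P2),
+// followed by P3 += O^t * P1O_P2.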
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int m_legs = PARAM_m(p) / 32;
+ mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1);
+ mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2]
+// [ 0 P3 ] [S2] [ P3*S2]
+// compute S * PS = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+// [ P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS) {
+ (void) m;
+#if MAYO_AVX
+ const int n = o + v;
+
+    /* Split S into S1 and S2 (done in constant time, although it does not have to be) */
+ unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+ unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+ mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+ mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+ NOT IMPLEMENTED
+#endif
+#else
+ const int n = o + v;
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+ mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+ mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+//  - A is a matrix with m rows and k*o+1 columns (values in the last column are
+//    not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+ const unsigned char *y, const unsigned char *r,
+ unsigned char *x, int k, int o, int m, int A_cols) {
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ unsigned char finished;
+ int col_upper_bound;
+ unsigned char correct_column;
+
+ // x <- r
+ for (int i = 0; i < k * o; i++) {
+ x[i] = r[i];
+ }
+
+ // compute Ar;
+ unsigned char Ar[M_MAX];
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+ }
+ mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+ // move y - Ar to last column of matrix A
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+ }
+
+ EF(A, m, k * o + 1);
+
+ // check if last row of A (excluding the last entry of y) is zero
+ unsigned char full_rank = 0;
+ for (int i = 0; i < A_cols - 1; i++) {
+ full_rank |= A[(m - 1) * A_cols + i];
+ }
+
+// It is okay to leak whether or not we need to restart
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+ if (full_rank == 0) {
+ return 0;
+ }
+
+ // back substitution in constant time
+ // the index of the first nonzero entry in each row is secret, which makes
+ // things less efficient
+
+ for (int row = m - 1; row >= 0; row--) {
+ finished = 0;
+ col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+ for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two bytes in constant time.
+            // ct_compare_8 returns 0x00 if the bytes are equal, 0xff otherwise.
+ correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+ unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+ x[col] ^= u;
+
+ for (int i = 0; i < row; i += 8) {
+ uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8)
+ ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+ ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+ ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+ tmp = mul_fx8(u, tmp);
+
+ A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf;
+ A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+ A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+ A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+ A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+ A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+ A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+ A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+ }
+
+ finished = finished | correct_column;
+ }
+ }
+ return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include
+#include
+#include
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+ #include
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+ out[6] = in[6];
+ out[7] = in[7];
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+ acc[6] ^= in[6];
+ acc[7] ^= in[7];
+}
+
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 8;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 8;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 8;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
+ m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+ m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8);
+ m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8);
+ m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8);
+ m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8);
+ m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+ m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8);
+ m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+ m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8);
+ m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8);
+ m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8);
+ m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8);
+ m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8);
+ vec_copy_128(bins + 8, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+}
+
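+// Multiply 16 GF(16) nibbles packed in a by the 4-bit scalar b, schoolbook style:
+// accumulate a for bit 0 of b, then repeatedly multiply a by x (reducing modulo
+// x^4+x+1 via the top bit of each nibble) and accumulate for the higher bits of b.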
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ uint64_t a_msb;
+ uint64_t a64 = a;
+ uint64_t b32 = b;
+ uint64_t r64 = a64 * (b32 & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 1) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 2) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 3) & 1);
+
+ return r64;
+}
+
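+// mul_table(b) packs the four GF(16) products b*1, b*x, b*x^2 and b*x^3 (reduced
+// modulo x^4+x+1) into the low nibbles of the four bytes of a 32-bit word, so a
+// vector multiply-accumulate needs only one table word per scalar.
+// e.g. mul_table(0x3) has low nibbles 0x3, 0x6, 0xC, 0xB = 3*{1, x, x^2, x^3}.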
+static inline uint32_t mul_table(uint8_t b){
+ uint32_t x = ((uint32_t) b) * 0x08040201;
+
+ uint32_t high_nibble_mask = 0xf0f0f0f0;
+
+ uint32_t high_half = x & high_nibble_mask;
+ return (x ^ (high_half >> 4) ^ (high_half >> 3));
+}
+
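+// acc += a * in, where in and acc hold 64 GF(16) elements packed as nibbles:
+// each of the four bit-planes of the nibbles is multiplied by the matching
+// entry of mul_table(a) and the partial products are XORed together.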
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 4;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < legs; i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 4;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 4;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
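+// Combine the 16 bucket vectors bins[0..15] (bins[a] collects everything that
+// still has to be multiplied by the field element a) into out = sum over a of
+// a * bins[a], using only multiplications by x and by x^-1.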
+static
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+ m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+ m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+ m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+ m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+ m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+ m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+ m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+ m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+ m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+ m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+ m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+ m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+ m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+ vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+}
+
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 6;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 6;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 6;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+ m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+ m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+ m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+ m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+ m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+ m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+ m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+ m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+ m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+ m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+ m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+ m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+ m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+ vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+
+#ifndef MAYO_VARIANT
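+// Generic-m counterpart of multiply_bins_64/96/128: combine the 16 bucket vectors
+// into out = sum over a of a * bins[a], using vector additions and multiplications by x.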
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+
+ m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2);
+ m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 3 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2);
+ m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2);
+ m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2);
+ m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2);
+ m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_copy(m_legs, bins + 1 * m_legs * 2, out);
+}
+#endif
+
+// compute P * S^t = [ P1 P2 ] * [S1] = [P1*S1 + P2*S2]
+// [ 0 P3 ] [S2] [ P3*S2]
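+// Bucket strategy: rather than multiplying every entry of P by the matching nibble
+// of S, each m-vector of P is XORed into one of 16 buckets selected by that nibble,
+// and the buckets are collapsed at the end (multiply_bins_* / m_multiply_bins).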
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *PS) {
+
+ const int n = o + v;
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+ (void)m;
+#else
+ const int m_legs = m / 32;
+#endif
+
+ /* Old approach which is constant time but doesn't have to be
+ unsigned char S1[V_MAX*K_MAX];
+ unsigned char S2[O_MAX*K_MAX];
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
+ */
+
+    // use a more stack-efficient version for MAYO_3 and MAYO_5
+ #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+ uint64_t accumulator[M_MAX * N_MAX] = {0};
+ int P1_used;
+ int P3_used;
+ for (int col = 0; col < k; col++) {
+ for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+ accumulator[i] = 0;
+ }
+ P1_used = 0;
+ for (int row = 0; row < v; row++) {
+ for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+ bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+ P1_used ++;
+ }
+
+ for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+ bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+ }
+ }
+
+ P3_used = 0;
+ for (int row = v; row < n; row++) {
+ for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+ bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+ P3_used ++;
+ }
+ }
+
+ for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+ bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+ }
+ }
+
+ #else
+
+ alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+ int P1_used = 0;
+ for (int row = 0; row < v; row++) {
+ for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+ }
+#endif
+ P1_used ++;
+ }
+
+
+ for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+ }
+#endif
+ }
+ }
+
+ int P3_used = 0;
+ for (int row = v; row < n; row++) {
+ for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+ }
+#endif
+ P3_used ++;
+ }
+ }
+
+ // multiply stuff according to the bins of the accumulator and add to PS.
+ int i = 0;
+ while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+ i++;
+#else
+ m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+ i++;
+#endif
+ }
+
+ #endif
+}
+
+
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){
+ alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0};
+ #if !defined(MAYO_VARIANT)
+ const int m_legs = m/32;
+ #else
+ (void) m;
+ #endif
+ for (int row = 0; row < k; row++) {
+ for (int j = 0; j < n; j++) {
+ for (int col = 0; col < k; col += 1) {
+ #if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 );
+ #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 );
+ #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 );
+ #else
+ m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 );
+ #endif
+ }
+ }
+ }
+
+    // multiply stuff according to the bins of the accumulator and add to SPS.
+ int i = 0;
+ while (i < k*k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8);
+ i++;
+#else
+ m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2);
+ i++;
+#endif
+ }
+}
+
+
+// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+
+ int bs_mat_entries_used = 0;
+ for (int r = 0; r < bs_mat_rows; r++) {
+ for (int c = triangular * r; c < bs_mat_cols; c++) {
+ for (int k = 0; k < mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k));
+#endif
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+}
+
+// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+
+ int bs_mat_entries_used = 0;
+ for (int r = 0; r < bs_mat_rows; r++) {
+ for (int c = triangular * r; c < bs_mat_cols; c++) {
+ for (int k = 0; k < mat_rows; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k));
+#endif
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+}
+
+// multiplies the transpose of a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+ for (int r = 0; r < mat_cols; r++) {
+ for (int c = 0; c < mat_rows; c++) {
+ for (int k = 0; k < bs_mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+ }
+ }
+ }
+}
+
+// multiplies a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+ for (int r = 0; r < mat_rows; r++) {
+ for (int c = 0; c < mat_cols; c++) {
+ for (int k = 0; k < bs_mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+ }
+ }
+ }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include
+#include
+#include
+#include
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+ const int leg = index / 16;
+ const int offset = index % 16;
+
+ return (in[leg] >> (offset*4)) & 0xF;
+}
+
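+// Pack ncols field elements (one per byte) into nibbles, two per byte, in the
+// word layout used by the constant-time row operations below; ef_unpack_m_vec
+// is the inverse.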
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+ int i;
+ unsigned char *out8 = (unsigned char *)out;
+ for(i = 0; i+1 < ncols; i += 2){
+#ifdef TARGET_BIG_ENDIAN
+ out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4);
+#else
+ out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4);
+#endif
+ }
+ if (ncols % 2 == 1){
+#ifdef TARGET_BIG_ENDIAN
+ out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0);
+#else
+ out8[i/2] = (in[i+0] << 0);
+#endif
+ }
+}
+
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+ const unsigned char *in8 = (const unsigned char *)in;
+ for(int i = 0; i < legs * 16; i += 2){
+#ifdef TARGET_BIG_ENDIAN
+ out[i] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF;
+ out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+ out[i] = (in8[i/2]) & 0xF;
+ out[i+1] = (in8[i/2] >> 4);
+#endif
+ }
+}
+
+
+// put the matrix in row echelon form, with ones on the first nonzero entries,
+// *in constant time*
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+
+ alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+ alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+ alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };
+
+ int row_len = (ncols + 15) / 16;
+
+ // nibbleslice the matrix A
+ for (int i = 0; i < nrows; i++) {
+ ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+ }
+
+ // pivot row is secret, pivot col is not
+
+ unsigned char inverse;
+ int pivot_row = 0;
+ for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+ int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+ int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+ // the pivot row is guaranteed to be between these lower and upper bounds if
+ // A has full rank
+
+ // zero out pivot row
+ for (int i = 0; i < row_len; i++) {
+ _pivot_row[i] = 0;
+ _pivot_row2[i] = 0;
+ }
+
+ // try to get a pivot row in constant time
+ unsigned char pivot = 0;
+ uint64_t pivot_is_zero = -1;
+ for (int row = pivot_row_lower_bound;
+ row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+
+ uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+ uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+
+ for (int j = 0; j < row_len; j++) {
+ _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+ packed_A[row * row_len + j];
+ }
+ pivot = m_extract_element(_pivot_row, pivot_col);
+ pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+ }
+
+ // multiply pivot row by inverse of pivot
+ inverse = inverse_f(pivot);
+ vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+
+ // conditionally write pivot row to the correct row, if there is a nonzero
+ // pivot
+ for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+ uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+ uint64_t do_not_copy = ~do_copy;
+ for (int col = 0; col < row_len; col++) {
+ packed_A[row * row_len + col] =
+ (do_not_copy & packed_A[row * row_len + col]) +
+ (do_copy & _pivot_row2[col]);
+ }
+ }
+
+ // eliminate entries below pivot
+ for (int row = pivot_row_lower_bound; row < nrows; row++) {
+ unsigned char below_pivot = (row > pivot_row);
+ unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+
+ vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+ packed_A + row * row_len);
+ }
+
+ pivot_row += (-(int64_t)(~pivot_is_zero));
+ }
+
+ unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+
+ // unbitslice the matrix A
+ for (int i = 0; i < nrows; i++) {
+ ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+ for (int j = 0; j < ncols; j++) {
+ A[i * ncols + j] = temp[j];
+ }
+ }
+
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+ mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+ mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+ mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
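+// decode/encode convert between byte strings holding two GF(16) nibbles per byte
+// and the one-element-per-byte representation used internally.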
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+ int i;
+ for (i = 0; i < mdeclen / 2; ++i) {
+ *mdec++ = m[i] & 0xf;
+ *mdec++ = m[i] >> 4;
+ }
+
+ if (mdeclen % 2 == 1) {
+ *mdec++ = m[i] & 0x0f;
+ }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+ int i;
+ for (i = 0; i < mlen / 2; ++i, m += 2) {
+ menc[i] = (*m) | (*(m + 1) << 4);
+ }
+
+ if (mlen % 2 == 1) {
+ menc[i] = (*m);
+ }
+}
+
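+// Compute the right-hand side y for Sign: the m-coefficient blocks vPv[i][j]
+// (and their transposed partners) are accumulated Horner-style, multiplying the
+// accumulator by z and reducing modulo f(z) via PARAM_f_tail between steps,
+// and the result is added to the target vector t.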
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *vPv = _vPv;
+ uint64_t temp[M_MAX/16] = {0};
+ unsigned char *temp_bytes = (unsigned char *) temp;
+ int k = 0;
+ for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+ for (int j = i; j < PARAM_k(p); j++) {
+ // multiply by X (shift up 4 bits)
+ unsigned char top = temp[k] >> 60;
+ temp[k] <<= 4;
+ k--;
+ for(; k>=0; k--){
+ temp[k+1] ^= temp[k] >> 60;
+ temp[k] <<= 4;
+ }
+ // reduce mod f(X)
+ for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+ if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+ }
+ else {
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+ }
+ }
+
+ // extract from vPv and add
+ for(k=0; k < PARAM_m(p)/16; k ++){
+ temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+ }
+ k--;
+ }
+ }
+
+ // add to y
+ for (int i = 0; i < PARAM_m(p); i+=2)
+ {
+#ifdef TARGET_BIG_ENDIAN
+ y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+ y[i] = t[i] ^ (temp_bytes[i/2] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+ }
+}
+
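+// Transpose a 16x16 matrix of nibbles held in 16 uint64_t words (one row per word),
+// using masked swaps at nibble, byte, 16-bit and 32-bit granularity.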
+static void transpose_16x16_nibbles(uint64_t *M){
+ static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+ static const uint64_t even_bytes = 0x00ff00ff00ff00ff;
+ static const uint64_t even_2bytes = 0x0000ffff0000ffff;
+ static const uint64_t even_half = 0x00000000ffffffff;
+
+ for (size_t i = 0; i < 16; i+=2)
+ {
+ uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles;
+ M[i ] ^= t << 4;
+ M[i+1] ^= t;
+ }
+
+ for (size_t i = 0; i < 16; i+=4)
+ {
+ uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes;
+ uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes;
+ M[i ] ^= (t0 << 8);
+ M[i+1] ^= (t1 << 8);
+ M[i+2] ^= t0;
+ M[i+3] ^= t1;
+ }
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes;
+ uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+ M[i ] ^= t0 << 16;
+ M[i+ 8] ^= t1 << 16;
+ M[i+ 4] ^= t0;
+ M[i+12] ^= t1;
+ }
+
+ for (size_t i = 0; i < 8; i++)
+ {
+ uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half;
+ M[i ] ^= t << 32;
+ M[i+8] ^= t;
+ }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
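+// Build the matrix A of the linear system solved in Sign from the bitsliced
+// products in VtL: each (i,j) pair adds M_i and M_j shifted down by one nibble
+// position per pair (multiplication by z); rows at index m and above are folded
+// back using the tail of f(z), and the result is transposed and unpacked into A_out.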
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *VtL = _VtL;
+ int bits_to_shift = 0;
+ int words_to_shift = 0;
+ uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+ size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+ const uint64_t *Mi, *Mj;
+
+ for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+ for (int j = PARAM_k(p) - 1; j >= i; --j) {
+ // add the M_i and M_j to A, shifted "down" by l positions
+ Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+
+ if (i != j) {
+ Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+ }
+
+ bits_to_shift += 4;
+ if(bits_to_shift == 64){
+ words_to_shift ++;
+ bits_to_shift = 0;
+ }
+ }
+ }
+
+ for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+ {
+ transpose_16x16_nibbles(A + c);
+ }
+
+ unsigned char tab[F_TAIL_LEN*4] = {0};
+ for (size_t i = 0; i < F_TAIL_LEN; i++)
+ {
+ tab[4*i] = mul_f(PARAM_f_tail(p)[i],1);
+ tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+ tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+ tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+ }
+
+ uint64_t low_bit_in_nibble = 0x1111111111111111;
+
+ for (size_t c = 0; c < A_width; c+= 16)
+ {
+ for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+ {
+ size_t pos = (r/16)*A_width + c + (r%16);
+ uint64_t t0 = A[pos] & low_bit_in_nibble;
+ uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+ uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+ uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+ for (size_t t = 0; t < F_TAIL_LEN; t++)
+ {
+ A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+ }
+ }
+ }
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i)
+ A[i] = BSWAP64(A[i]);
+#endif
+
+ for (int r = 0; r < PARAM_m(p); r+=16)
+ {
+ for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+ {
+ for (size_t i = 0; i < 16; i++)
+ {
+ decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+ }
+ }
+ }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+ int ret = 0;
+
+ ret = mayo_keypair_compact(p, pk, sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+err:
+ return ret;
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+ unsigned char y[M_MAX]; // secret data
+ unsigned char salt[SALT_BYTES_MAX]; // not secret data
+ unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+ Vdec[N_MINUS_O_MAX * K_MAX]; // secret data
+ unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data
+ unsigned char x[K_MAX * N_MAX]; // not secret data
+ unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data
+ unsigned char s[K_MAX * N_MAX]; // not secret data
+ const unsigned char *seed_sk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+ alignas(32) sk_t sk; // secret data
+ unsigned char Ox[N_MINUS_O_MAX]; // secret data
+ // unsigned char Mdigest[DIGEST_BYTES];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+ unsigned char *ctrbyte;
+ unsigned char *vi;
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_v_bytes = PARAM_v_bytes(p);
+ const int param_r_bytes = PARAM_r_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_A_cols = PARAM_A_cols(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ ret = mayo_expand_sk(p, csk, &sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+ seed_sk = csk;
+ decode(sk.o, O, (param_n - param_o) * param_o);
+
+ // hash message
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ uint64_t *P1 = sk.p;
+ uint64_t *L = P1 + (param_P1_bytes/8);
+ alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ L[i] = BSWAP64(L[i]);
+ }
+#endif
+
+ // choose the randomizer
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(tmp + param_digest_bytes, param_salt_bytes);
+ #else
+ if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // hashing to salt
+ memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+ param_sk_seed_bytes);
+ shake256(salt, param_salt_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+ // hashing to t
+ memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+ ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+ shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+ decode(tenc, t, param_m); // may not be necessary
+
+ for (int ctr = 0; ctr <= 255; ++ctr) {
+ *ctrbyte = (unsigned char)ctr;
+
+ shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+ // decode the v_i vectors
+ for (int i = 0; i <= param_k - 1; ++i) {
+ decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+ param_n - param_o);
+ }
+
+ // compute all the V * L matrices.
+ // compute all the V * P1 * V^T matrices.
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+ V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+ compute_rhs(p, Y, t, y);
+ compute_A(p, Mtmp, A);
+
+    decode(V + param_k * param_v_bytes, r, param_k * param_o);
+ if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+ break;
+ } else {
+ memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+ }
+ }
+
+ // s is already 0
+ // TODO: optimize this?
+ for (int i = 0; i <= param_k - 1; ++i) {
+ vi = Vdec + i * (param_n - param_o);
+ mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+ mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+ memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+ }
+ encode(s, sig, param_n * param_k);
+ memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+ *siglen = param_sig_bytes;
+err:
+ mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+ mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+ mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+ mayo_secure_clear(r, K_MAX * O_MAX + 1);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(&sk, sizeof(sk_t));
+ mayo_secure_clear(Ox, N_MINUS_O_MAX);
+ mayo_secure_clear(tmp,
+ DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+ return ret;
+}
+
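For orientation, the byte layout of the signature assembled above can be read off the final encode/memcpy calls; a summary inferred from the code (not a spec statement):

/* Layout of the detached signature written by mayo_sign_signature:
 *   sig[0 .. sig_bytes - salt_bytes)    Encode(s): the k*n GF(16) entries of the
 *                                       solution vectors, packed two per byte
 *   sig[sig_bytes - salt_bytes .. end)  salt used to derive t
 * mayo_sign below then appends the message, so sm = sig || m and
 * *smlen = sig_bytes + mlen. */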
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ size_t siglen = param_sig_bytes;
+ ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk);
+ if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes)
+ goto err;
+
+ memmove(sm + param_sig_bytes, m, mlen);
+ *smlen = siglen + mlen;
+err:
+ return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk) {
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ if (smlen < (size_t)param_sig_bytes) {
+ return MAYO_ERR;
+ }
+ int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm,
+ pk);
+
+ if (result == MAYO_OK) {
+ *mlen = smlen - param_sig_bytes;
+ memmove(m, sm + param_sig_bytes, *mlen);
+ }
+
+ return result;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char *seed_sk = csk;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+ alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+ unsigned char *seed_pk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ // seed_sk $←- B^(sk_seed bytes)
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(seed_sk, param_sk_seed_bytes);
+ #else
+ if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // S ← shake256(seedsk, pk seed bytes + O bytes)
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ // seed_pk ← s[0 : pk_seed_bytes]
+ seed_pk = S;
+
+ // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+ decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+
+ int m_legs = param_m / 32;
+
+ uint64_t *P1 = P;
+ uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+ // compute P3 = O^t * (P1*O + P2)
+ Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+ // store seed_pk in cpk
+ memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+ alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+ // compute Upper(P3) and store in cpk
+ m_upper(m_legs, P3, P3_upper, param_o);
+
+ memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+ return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *pk) {
+ #ifdef MAYO_VARIANT
+ (void)p;
+ #endif
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes);
+ pk += param_P1_bytes + param_P2_bytes;
+ memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes);
+ return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *sk) {
+ int ret = MAYO_OK;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ uint64_t *P = sk->p;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_o = PARAM_o(p);
+ const int param_v = PARAM_v(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ const unsigned char *seed_sk = csk;
+ unsigned char *seed_pk = S;
+
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ decode(S + param_pk_seed_bytes, O,
+ param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+ uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ uint64_t *P1 = P;
+ // compute L_i = (P1 + P1^t)*O + P2
+ uint64_t *L = P2;
+ P1P1t_times_O(p, P1, O, L);
+
+ // write to sk
+ memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *cpk) {
+ unsigned char tEnc[M_BYTES_MAX];
+ unsigned char t[M_MAX];
+ unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+ unsigned char s[K_MAX * N_MAX];
+ alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+ if (ret != MAYO_OK) {
+ return MAYO_ERR;
+ }
+
+ uint64_t *P1 = pk;
+ uint64_t *P2 = pk + (param_P1_bytes / 8);
+ uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ P2[i] = BSWAP64(P2[i]);
+ }
+ for (int i = 0; i < param_P3_bytes / 8; ++i) {
+ P3[i] = BSWAP64(P3[i]);
+ }
+#endif
+
+ // hash m
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ // compute t
+ memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+ param_salt_bytes);
+ shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+ decode(tEnc, t, param_m);
+
+ // decode s
+ decode(sig, s, param_k * param_n);
+
+ // Compute S*P*S^T
+ alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+ m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+ param_v, param_o, param_k, SPS);
+
+ // combine the vectors in SPS and reduce mod f(X)
+ compute_rhs(p, SPS, y, y);
+
+ if (memcmp(y, t, param_m) == 0) {
+ return MAYO_OK; // good signature
+ }
+ return MAYO_ERR; // bad signature
+}
+
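Taken together, mayo_keypair, mayo_sign and mayo_open form the one-shot API used by the api.c wrappers later in this patch. A minimal round-trip sketch, not part of the patch: it assumes a fixed-variant build in which the parameter pointer is unused (api.c passes 0); a dynamic-parameter build would pass &MAYO_1 .. &MAYO_5 instead of NULL.

#include <string.h>
#include <mayo.h>   /* MAYO_OK/MAYO_ERR and the *_BYTES_MAX sizes */

static int mayo_roundtrip_example(void) {
    unsigned char cpk[CPK_BYTES_MAX], csk[CSK_BYTES_MAX];
    unsigned char msg[32] = {0x42}, opened[32];
    unsigned char sm[SIG_BYTES_MAX + sizeof msg];
    size_t smlen = 0, mlen = 0;

    if (mayo_keypair(NULL, cpk, csk) != MAYO_OK)                  /* compact keypair */
        return MAYO_ERR;
    if (mayo_sign(NULL, sm, &smlen, msg, sizeof msg, csk) != MAYO_OK)
        return MAYO_ERR;                                          /* sm = sig || msg */
    if (mayo_open(NULL, opened, &mlen, sm, smlen, cpk) != MAYO_OK)
        return MAYO_ERR;                                          /* verify + recover */
    return (mlen == sizeof msg && memcmp(opened, msg, mlen) == 0) ? MAYO_OK : MAYO_ERR;
}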
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64 \
+ { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3
+#define F_TAIL_96 \
+ { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x
+#define F_TAIL_128 \
+ { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2
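Judging from the comments above, each tail array stores the non-leading coefficients of f(z) as GF(16) nibbles, with index i holding the coefficient of z^i and the monic z^m term left implicit; checking F_TAIL_64 against its comment:

/* F_TAIL_64 = { 8, 0, 2, 8, 0 }:
 *   index 0 -> 8 = x^3   (constant term)
 *   index 2 -> 2 = x     (coefficient of z^2)
 *   index 3 -> 8 = x^3   (coefficient of z^3)
 * which reproduces f(z) = z^64 + x^3*z^3 + x*z^2 + x^3, with F_TAIL_LEN = 5
 * tail coefficients (z^0 .. z^4) stored. */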
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
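As an illustration of the namespacing (for a hypothetical configuration with MAYO_VARIANT defined as MAYO_1 and MAYO_BUILD_TYPE_OPT set), the expansion chain is:

/* MAYO_NAMESPACE(mayo_sign)
 *   -> PARAM_NAME3(opt, mayo_sign)
 *   -> PARAM_JOIN3(MAYO_1, opt, mayo_sign)
 *   -> pqmayo_MAYO_1_opt_mayo_sign
 * so the ref, opt and avx2 builds of each variant export distinct symbols and
 * can be linked into the same library; without MAYO_VARIANT the names are left
 * untouched. */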
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX SALT_BYTES_MAX
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+ int m;
+ int n;
+ int o;
+ int k;
+ int q;
+ const unsigned char *f_tail;
+ int m_bytes;
+ int O_bytes;
+ int v_bytes;
+ int r_bytes;
+ int R_bytes;
+ int P1_bytes;
+ int P2_bytes;
+ int P3_bytes;
+ int csk_bytes;
+ int esk_bytes;
+ int cpk_bytes;
+ int epk_bytes;
+ int sig_bytes;
+ int salt_bytes;
+ int sk_seed_bytes;
+ int digest_bytes;
+ int pk_seed_bytes;
+ const char *name;
+} mayo_params_t;
+
+typedef struct sk_t {
+ uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+ uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, 0 is returned; otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is verified
+ * @param[in] mlen Message length
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mem.h b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stdint.h>
+#include <stdlib.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+ int32_t diff = b - a;
+ return (uint32_t) (diff >> (8*sizeof(uint32_t)-1));
+}
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+ int64_t diff = ((int64_t) b) - ((int64_t) a);
+ return (uint64_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF
+static inline uint32_t ct_compare_32(int a, int b) {
+ return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF
+static inline uint64_t ct_compare_64(int a, int b) {
+ return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00, else 0xFF
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+ return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
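These helpers return an all-ones or all-zero mask rather than a boolean so that callers can select data without a secret-dependent branch; a minimal sketch of the usual pattern (illustration only, not a function from this patch):

/* Branch-free select built on ct_compare_8: returns a when x == y, else b. */
static inline unsigned char ct_select_8(unsigned char x, unsigned char y,
                                        unsigned char a, unsigned char b) {
    unsigned char mask = ct_compare_8(x, y);   /* 0x00 if equal, 0xff if not */
    return (unsigned char)((a & (unsigned char)~mask) | (b & mask));
}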
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+ OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+ OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/params.c b/src/sig/mayo/pqmayo_mayo-1_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) \
+ const mayo_params_t nm = { \
+ .m = PARAM_JOIN2(nm, m), \
+ .n = PARAM_JOIN2(nm, n), \
+ .o = PARAM_JOIN2(nm, o), \
+ .k = PARAM_JOIN2(nm, k), \
+ .q = PARAM_JOIN2(nm, q), \
+ .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+ .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+ .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+ .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+ .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+ .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+ .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+ .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+ .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+ .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+ .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+ .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+ .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+ .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+ .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+ .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+ .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+ .name = #nm \
+ };
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+ // carryless multiply
+ unsigned char p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ unsigned char top_p = p & 0xf0;
+ unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f;
+ return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+ // carryless multiply
+ uint64_t p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+ uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f;
+ return out;
+}
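A worked example of the carryless multiply and the reduction step used by both routines:

/* mul_f(0x8, 0x2), i.e. x^3 * x:
 *   carryless product p = 0x10 (x^4)
 *   top_p = p & 0xf0 = 0x10
 *   out = (0x10 ^ (0x10 >> 4) ^ (0x10 >> 3)) & 0x0f = 0x03 = x + 1,
 * matching x^4 = x + 1 (mod x^4 + x + 1).  mul_fx8 applies the same mask-based
 * reduction to eight GF(16) elements stored one per byte of a uint64_t. */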
+
+// GF(16) addition
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) subtraction
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) negation
+static inline unsigned char neg_f(unsigned char a) {
+ return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+ // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+ // 10, 4, 3, 8}; return table[a & 15];
+
+ unsigned char a2 = mul_f(a, a);
+ unsigned char a4 = mul_f(a2, a2);
+ unsigned char a8 = mul_f(a4, a4);
+ unsigned char a6 = mul_f(a2, a4);
+ unsigned char a14 = mul_f(a8, a6);
+
+ return a14;
+}
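The chain of squarings computes a^14, which is the inverse because every nonzero element of GF(16) satisfies a^15 = 1; a quick consistency check against the table kept above as a comment:

/* inverse_f(0x2) = 0x9:  mul_f(0x2, 0x9) = x * (x^3 + 1) = x^4 + x
 *                        = (x + 1) + x = 1  (mod x^4 + x + 1),
 * in agreement with table[2] = 9.  For a = 0 the function returns 0,
 * since 0 has no inverse. */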
+
+static inline unsigned char lincomb(const unsigned char *a,
+ const unsigned char *b, int n, int m) {
+ unsigned char ret = 0;
+ for (int i = 0; i < n; ++i, b += m) {
+ ret = add_f(mul_f(a[i], *b), ret);
+ }
+ return ret;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int colrow_ab, int row_a, int col_b) {
+ for (int i = 0; i < row_a; ++i, a += colrow_ab) {
+ for (int j = 0; j < col_b; ++j, ++c) {
+ *c = lincomb(a, b + j, colrow_ab, col_b);
+ }
+ }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int m, int n) {
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < n; ++j) {
+ *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j));
+ }
+ }
+}
+
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+ uint64_t *out) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ out[i] = in[i];
+ }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+ uint64_t *acc) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ acc[i] ^= in[i];
+ }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+ uint32_t a_msb;
+ uint32_t a32 = a;
+ uint32_t b32 = b;
+ uint32_t r32 = a32 * (b32 & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 1) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 2) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 3) & 1);
+
+ return r32;
+
+}
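gf16v_mul_u32 is a SIMD-within-a-register routine: the eight nibbles of a32 are treated as independent GF(16) elements, and each round of the unrolled body conditionally accumulates the current a32 and then multiplies every lane by x:

/* Per round:
 *   r32 ^= a32 * ((b32 >> i) & 1)        -- accumulate where bit i of b is set
 *   a_msb = a32 & 0x88888888             -- per-nibble MSBs, cleared before the
 *                                           shift so no lane spills into its neighbour
 *   a32 = (a32 << 1) ^ ((a_msb >> 3) * 3) -- multiply by x, reducing x^4 -> x + 1 (0x3)
 * e.g. a lane holding 0x8 (x^3) times b = 0x2 (x) yields 0x3 (x + 1), matching
 * mul_f above.  The gf16v_mul_u64 helper used by m_vec_mul_add is presumably the
 * 64-bit version of the same idea, acting on sixteen nibbles at once. */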
+
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < m_legs*2;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for(int i=0; i < m_legs*2; i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+#include
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen) {
+ (void) inputByteLen;
+ uint8_t iv[12] = { 0 };
+ aes128ctr_prf(output, outputByteLen, input, iv);
+ return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.c b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c
new file mode 100644
index 0000000000..a7cf85eedf
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+#include <api.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_2
+#else
+#define MAYO_PARAMS 0
+#endif
+
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+ return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk) {
+ return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk) {
+ return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk) {
+ return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk) {
+ if (siglen != CRYPTO_BYTES)
+ return -1;
+ return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+}
+
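A short sketch of the detached-signature flow through these NIST-style wrappers (illustration only; sizes come from this variant's api.h):

#include <stddef.h>
#include <api.h>

static int detached_sign_verify_example(const unsigned char *msg, size_t mlen) {
    unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
    unsigned char sig[CRYPTO_BYTES];
    size_t siglen = 0;

    if (crypto_sign_keypair(pk, sk) != 0)
        return -1;
    if (crypto_sign_signature(sig, &siglen, msg, mlen, sk) != 0)
        return -1;
    /* 0 on success; any other value means the signature did not verify */
    return crypto_sign_verify(sig, siglen, msg, mlen, pk);
}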
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.h b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h
new file mode 100644
index 0000000000..265a5639db
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 5488
+#define CRYPTO_BYTES 180
+
+#define CRYPTO_ALGNAME "MAYO-2"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+ mayo12_m_upper(m_legs, in, out, size);
+#else
+ int m_vecs_stored = 0;
+ for (int r = 0; r < size; r++) {
+ for (int c = r; c < size; c++) {
+ m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored );
+ if (r != c) {
+ m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored );
+ }
+ m_vecs_stored ++;
+ }
+ }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ (void) p;
+ mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ (void) p;
+ mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ (void) p;
+ mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+ #ifndef MAYO_VARIANT
+ const int m_legs = PARAM_m(p) / 32;
+ #else
+ (void) p;
+ #endif
+ const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    int bs_mat_entries_used = 0;
+ for (int r = 0; r < param_v; r++) {
+ for (int c = r; c < param_v; c++) {
+ if(c==r) {
+ bs_mat_entries_used += 1;
+ continue;
+ }
+ for (int k = 0; k < param_o; k += 1) {
+
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+ vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k));
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k));
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+ param_k, param_n - param_o, param_o);
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+ mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+ Y, param_k, param_n - param_o,
+ param_k);
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int m_legs = PARAM_m(p) / 32;
+ mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1);
+ mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [ 0   P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [ P3*S2          = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS) {
+ (void) m;
+#if MAYO_AVX
+ const int n = o + v;
+
+ /* Old approach which is constant time but doesn't have to be */
+ unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+ unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+ mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+ mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+ NOT IMPLEMENTED
+#endif
+#else
+ const int n = o + v;
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+ mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+ mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+//   not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+ const unsigned char *y, const unsigned char *r,
+ unsigned char *x, int k, int o, int m, int A_cols) {
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ unsigned char finished;
+ int col_upper_bound;
+ unsigned char correct_column;
+
+ // x <- r
+ for (int i = 0; i < k * o; i++) {
+ x[i] = r[i];
+ }
+
+ // compute Ar;
+ unsigned char Ar[M_MAX];
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+ }
+ mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+ // move y - Ar to last column of matrix A
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+ }
+
+ EF(A, m, k * o + 1);
+
+ // check if last row of A (excluding the last entry of y) is zero
+ unsigned char full_rank = 0;
+ for (int i = 0; i < A_cols - 1; i++) {
+ full_rank |= A[(m - 1) * A_cols + i];
+ }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+ if (full_rank == 0) {
+ return 0;
+ }
+
+ // back substitution in constant time
+ // the index of the first nonzero entry in each row is secret, which makes
+ // things less efficient
+
+ for (int row = m - 1; row >= 0; row--) {
+ finished = 0;
+ col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+ for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two bytes in constant time.
+            // Returns 0x00 if they are equal, 0xff otherwise.
+ correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+ unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+ x[col] ^= u;
+
+ for (int i = 0; i < row; i += 8) {
+ uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8)
+ ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+ ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+ ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+ tmp = mul_fx8(u, tmp);
+
+ A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf;
+ A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+ A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+ A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+ A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+ A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+ A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+ A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+ }
+
+ finished = finished | correct_column;
+ }
+ }
+ return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include
+#include
+#include
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+ #include
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+ out[6] = in[6];
+ out[7] = in[7];
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+ acc[6] ^= in[6];
+ acc[7] ^= in[7];
+}
+
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+inline
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 8;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static
+ inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
+ m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+ m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8);
+ m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8);
+ m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8);
+ m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8);
+ m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+ m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8);
+ m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+ m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8);
+ m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8);
+ m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8);
+ m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8);
+ m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8);
+ vec_copy_128(bins + 8, out);
+}
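The multiply_bins_* helpers implement the usual bucket trick for GF(16) matrix products: callers first XOR-accumulate vectors into 16 bins according to the scalar they should be multiplied by, and this routine then folds the bins together using only multiplications by x (0x2) and x^-1 (0x9). A plain reference version of what it appears to compute, under that reading and assuming the helpers above:

/* Reference (unoptimized) equivalent:
 * out = XOR over i of ( i * bins[i] ), i = 1 .. 15, with 8 uint64_t per bin. */
static inline void multiply_bins_128_reference(const uint64_t *bins, uint64_t *out) {
    uint64_t acc[8] = {0};
    for (unsigned char i = 1; i < 16; i++) {
        vec_mul_add_128(bins + 8 * i, i, acc);   /* acc ^= i * bins[i] */
    }
    vec_copy_128(acc, out);
}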
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+}
+
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+inline
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<4;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+ m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+ m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+ m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+ m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+ m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+ m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+ m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+ m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+ m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+ m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+ m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+ m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+ m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+ vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+}
+
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+ for(int i=0;i<6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], 0x9);
+ }
+}
+
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < 6;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+ m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+ m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+ m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+ m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+ m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+ m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+ m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+ m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+ m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+ m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+ m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+ m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+ m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+ vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+#include
+
+#define K_OVER_2 ((K_MAX+1)/2)
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = {
+ 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+ 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+ 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+ 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+//
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+
+ __m256i bx = _mm256_set1_epi16( b & 0xf );
+ __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+ const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+ const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+ const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+ const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+ __m256i mask_1 = _mm256_set1_epi16(1);
+ __m256i mask_4 = _mm256_set1_epi16(4);
+ __m256i mask_0 = _mm256_setzero_si256();
+
+ return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+ ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+ ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+ ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) {
+ return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f);
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+ __m256i multab_l = tbl32_gf16_multab2( b );
+ __m256i multab_h = _mm256_slli_epi16( multab_l, 4 );
+
+ return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) );
+}
+
+static
+inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+ // build multiplication tables
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ for (size_t c = 0; c < O_MAX; c+=2)
+ {
+ O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4);
+ }
+ }
+}
+
+
+static
+inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+ // build multiplication tables
+ size_t r;
+ for (size_t c = 0; c < V_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4);
+ }
+#if K_MAX % 2 == 1
+ V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]);
+#endif
+ }
+}
+
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = {
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+ 0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+ 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+ 0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+ 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+ 0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+ 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+ 0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+ 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+ 0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+ 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+ 0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+ 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+ 0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+ 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+ 0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+ 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+ 0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+ 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+ 0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+ 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+ 0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+ 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+ 0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+ 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+ 0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+ 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+ 0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+ 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a,
+ 0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+static
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+ size_t r;
+ for (size_t c = 0; c < V_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]))
+ ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4);
+ }
+#if K_MAX % 2 == 1
+ S1_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c]));
+#endif
+ }
+}
+
+static
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+ // build multiplication tables
+ size_t r;
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ for (r = 0; r+1 < K_MAX; r+= 2)
+ {
+ S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c]))
+ ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4);
+ }
+#if K_MAX % 2 == 1
+ S2_multabs[K_OVER_2*c + r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ;
+#endif
+ }
+}
+
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ uint64_t a_msb;
+ uint64_t a64 = a;
+ uint64_t b32 = b;
+ uint64_t r64 = a64 * (b32 & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 1) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 2) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 3) & 1);
+
+ return r64;
+}
+
+#endif
+
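gf16v_mul_u64 above multiplies all sixteen GF(16) nibbles packed in a 64-bit word by the same field element; it is the routine the m_vec_mul_add_* helpers earlier in this patch call. An illustrative use, written as a sketch only and assuming gf16v_mul_u64 from arithmetic_common.h is in scope:

#include <assert.h>
#include <stdint.h>

/* Assumes gf16v_mul_u64() from arithmetic_common.h is in scope. */
static void gf16v_mul_u64_example(void) {
    /* Low nibble 0x8 = x^3, next nibble 0x1 = 1, all other nibbles zero. */
    uint64_t packed = 0x0000000000000018ULL;
    uint64_t out = gf16v_mul_u64(packed, 0x2); /* multiply every nibble by x */
    assert((out & 0xf) == 0x3);        /* x^3 * x = x^4 = x + 1 = 0x3 */
    assert(((out >> 4) & 0xf) == 0x2); /* 1 * x = x = 0x2 */
}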
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+ __m256i bx = _mm256_set1_epi16( b & 0xf );
+ __m256i b1 = _mm256_srli_epi16( bx, 1 );
+
+ const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+ const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+ const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+ const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+ __m256i mask_1 = _mm256_set1_epi16(1);
+ __m256i mask_4 = _mm256_set1_epi16(4);
+ __m256i mask_0 = _mm256_setzero_si256();
+
+ return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+ ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+ ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+ ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+/* Put the matrix in row echelon form, with ones as the leading (first nonzero) entries, in constant time. */
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+
+ (void) _nrows;
+ (void) _ncols;
+
+ #define nrows M_MAX
+ #define ncols (K_MAX * O_MAX + 1)
+
+ #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+ #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+ __m256i _pivot_row[AVX_REGS_PER_ROW];
+ __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+ unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+ unsigned char* A_bytes = (unsigned char*) A_avx;
+
+ // load A in the tail of AVX2 registers
+ for (int i = 0; i < nrows; i++) {
+ for (int j = 0; j < ncols; j++)
+ {
+ A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+ }
+ }
+
+ // pivot row is secret, pivot col is not
+ unsigned char inverse;
+ int pivot_row = 0;
+ int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+ for (; pivot_col < MAX_COLS-128; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-96; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-64; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS-32; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+ for (; pivot_col < MAX_COLS; pivot_col++) {
+ #include "echelon_form_loop.h"
+ }
+
+ // write the matrix A back
+ for (int i = 0; i < nrows; i++) {
+ for (int j = 0; j < ncols; j++) {
+ A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+ }
+ }
+ mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+ mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+int pivot_col_rounded = pivot_col/32;
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+ _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1;
+for (int row = pivot_row_lower_bound;
+ row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+ uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row);
+ uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+ __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) );
+ for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+ }
+ pivot = pivot_row_bytes[pivot_col];
+ pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]);
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+ unsigned char below_pivot = (unsigned char) (ct_is_greater_than(row, pivot_row));
+ unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+ __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim);
+ if (row <= pivot_row_upper_bound) {
+ __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero);
+ for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) {
+ A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+ _mm256_shuffle_epi8(multab, _pivot_row[col]);
+ }
+ } else {
+ for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+ A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+ }
+ }
+}
+
+pivot_row += (-(int32_t)(~pivot_is_zero));
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+ int i;
+ for (i = 0; i < mdeclen / 2; ++i) {
+ *mdec++ = m[i] & 0xf;
+ *mdec++ = m[i] >> 4;
+ }
+
+ if (mdeclen % 2 == 1) {
+ *mdec++ = m[i] & 0x0f;
+ }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+ int i;
+ for (i = 0; i < mlen / 2; ++i, m += 2) {
+ menc[i] = (*m) | (*(m + 1) << 4);
+ }
+
+ if (mlen % 2 == 1) {
+ menc[i] = (*m);
+ }
+}
+
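decode and encode convert between the packed byte representation (two GF(16) nibbles per byte, low nibble first) and one-nibble-per-byte vectors, and are inverses of each other for even-length inputs. A small illustration, as a sketch only and assuming the static decode()/encode() above are in scope:

#include <assert.h>
#include <string.h>

/* Assumes the static decode()/encode() defined above are in scope. */
static void codec_example(void) {
    const unsigned char packed[2] = { 0x21, 0x43 }; /* nibbles 1, 2, 3, 4 */
    unsigned char nibbles[4], repacked[2];

    decode(packed, nibbles, 4);   /* nibbles  = {0x1, 0x2, 0x3, 0x4} */
    encode(nibbles, repacked, 4); /* repacked = {0x21, 0x43} again   */

    assert(nibbles[0] == 0x1 && nibbles[3] == 0x4);
    assert(memcmp(packed, repacked, 2) == 0);
}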
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *vPv = _vPv;
+ uint64_t temp[M_MAX/16] = {0};
+ unsigned char *temp_bytes = (unsigned char *) temp;
+ int k = 0;
+ for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+ for (int j = i; j < PARAM_k(p); j++) {
+ // multiply by X (shift up 4 bits)
+ unsigned char top = temp[k] >> 60;
+ temp[k] <<= 4;
+ k--;
+ for(; k>=0; k--){
+ temp[k+1] ^= temp[k] >> 60;
+ temp[k] <<= 4;
+ }
+ // reduce mod f(X)
+ for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+ if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+ }
+ else {
+#ifdef TARGET_BIG_ENDIAN
+ temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+ temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+ }
+ }
+
+ // extract from vPv and add
+ for(k=0; k < PARAM_m(p)/16; k ++){
+ temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+ }
+ k--;
+ }
+ }
+
+ // add to y
+ for (int i = 0; i < PARAM_m(p); i+=2)
+ {
+#ifdef TARGET_BIG_ENDIAN
+ y[i] = t[i] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+ y[i] = t[i] ^ (temp_bytes[i/2] & 0xF);
+ y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+ }
+}
+
+static void transpose_16x16_nibbles(uint64_t *M){
+ static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+ static const uint64_t even_bytes = 0x00ff00ff00ff00ff;
+ static const uint64_t even_2bytes = 0x0000ffff0000ffff;
+ static const uint64_t even_half = 0x00000000ffffffff;
+
+ for (size_t i = 0; i < 16; i+=2)
+ {
+ uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles;
+ M[i ] ^= t << 4;
+ M[i+1] ^= t;
+ }
+
+ for (size_t i = 0; i < 16; i+=4)
+ {
+ uint64_t t0 = ((M[i ] >> 8) ^ M[i+2]) & even_bytes;
+ uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes;
+ M[i ] ^= (t0 << 8);
+ M[i+1] ^= (t1 << 8);
+ M[i+2] ^= t0;
+ M[i+3] ^= t1;
+ }
+
+ for (size_t i = 0; i < 4; i++)
+ {
+ uint64_t t0 = ((M[i ] >> 16) ^ M[i+ 4]) & even_2bytes;
+ uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+ M[i ] ^= t0 << 16;
+ M[i+ 8] ^= t1 << 16;
+ M[i+ 4] ^= t0;
+ M[i+12] ^= t1;
+ }
+
+ for (size_t i = 0; i < 8; i++)
+ {
+ uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half;
+ M[i ] ^= t << 32;
+ M[i+8] ^= t;
+ }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+ #ifndef ENABLE_PARAMS_DYNAMIC
+ (void) p;
+ #endif
+
+ const uint64_t *VtL = _VtL;
+ int bits_to_shift = 0;
+ int words_to_shift = 0;
+ uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+ size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+ const uint64_t *Mi, *Mj;
+
+ for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+ for (int j = PARAM_k(p) - 1; j >= i; --j) {
+ // add the M_i and M_j to A, shifted "down" by l positions
+ Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+
+ if (i != j) {
+ Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+ for (int c = 0; c < PARAM_o(p); c++) {
+ for (int k = 0; k < PARAM_m(p)/16; k++)
+ {
+ A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+ if(bits_to_shift > 0){
+ A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+ }
+ }
+ }
+ }
+
+ bits_to_shift += 4;
+ if(bits_to_shift == 64){
+ words_to_shift ++;
+ bits_to_shift = 0;
+ }
+ }
+ }
+
+ for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+ {
+ transpose_16x16_nibbles(A + c);
+ }
+
+ unsigned char tab[F_TAIL_LEN*4] = {0};
+ for (size_t i = 0; i < F_TAIL_LEN; i++)
+ {
+ tab[4*i] = mul_f(PARAM_f_tail(p)[i],1);
+ tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+ tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+ tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+ }
+
+ uint64_t low_bit_in_nibble = 0x1111111111111111;
+
+ for (size_t c = 0; c < A_width; c+= 16)
+ {
+ for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+ {
+ size_t pos = (r/16)*A_width + c + (r%16);
+ uint64_t t0 = A[pos] & low_bit_in_nibble;
+ uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+ uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+ uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+ for (size_t t = 0; t < F_TAIL_LEN; t++)
+ {
+ A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+ }
+ }
+ }
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i)
+ A[i] = BSWAP64(A[i]);
+#endif
+
+ for (int r = 0; r < PARAM_m(p); r+=16)
+ {
+ for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+ {
+ for (size_t i = 0; i < 16; i++)
+ {
+ decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+ }
+ }
+ }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+ int ret = 0;
+
+ ret = mayo_keypair_compact(p, pk, sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+err:
+ return ret;
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+ unsigned char y[M_MAX]; // secret data
+ unsigned char salt[SALT_BYTES_MAX]; // not secret data
+ unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+ Vdec[N_MINUS_O_MAX * K_MAX]; // secret data
+ unsigned char A[M_MAX * (K_MAX * O_MAX + 1)]; // secret data
+ unsigned char x[K_MAX * N_MAX]; // not secret data
+ unsigned char r[K_MAX * O_MAX + 1] = { 0 }; // secret data
+ unsigned char s[K_MAX * N_MAX]; // not secret data
+ const unsigned char *seed_sk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+ alignas(32) sk_t sk; // secret data
+ unsigned char Ox[N_MINUS_O_MAX]; // secret data
+ // unsigned char Mdigest[DIGEST_BYTES];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+ unsigned char *ctrbyte;
+ unsigned char *vi;
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_v_bytes = PARAM_v_bytes(p);
+ const int param_r_bytes = PARAM_r_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_A_cols = PARAM_A_cols(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ ret = mayo_expand_sk(p, csk, &sk);
+ if (ret != MAYO_OK) {
+ goto err;
+ }
+
+ seed_sk = csk;
+ decode(sk.o, O, (param_n - param_o) * param_o);
+
+ // hash message
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ uint64_t *P1 = sk.p;
+ uint64_t *L = P1 + (param_P1_bytes/8);
+ alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ L[i] = BSWAP64(L[i]);
+ }
+#endif
+
+ // choose the randomizer
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(tmp + param_digest_bytes, param_salt_bytes);
+ #else
+ if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // hashing to salt
+ memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+ param_sk_seed_bytes);
+ shake256(salt, param_salt_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+ // hashing to t
+ memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+ ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+ shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+ decode(tenc, t, param_m); // may not be necessary
+
+ for (int ctr = 0; ctr <= 255; ++ctr) {
+ *ctrbyte = (unsigned char)ctr;
+
+ shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+ param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+ // decode the v_i vectors
+ for (int i = 0; i <= param_k - 1; ++i) {
+ decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+ param_n - param_o);
+ }
+
+ // compute all the V * L matrices.
+ // compute all the V * P1 * V^T matrices.
+ alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+ V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+ compute_rhs(p, Y, t, y);
+ compute_A(p, Mtmp, A);
+
+ decode(V + param_k * param_v_bytes, r,
+ param_k *
+ param_o);
+ if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+ break;
+ } else {
+ memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+ }
+ }
+
+ // s is already 0
+ // TODO: optimize this?
+ for (int i = 0; i <= param_k - 1; ++i) {
+ vi = Vdec + i * (param_n - param_o);
+ mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+ mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+ memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+ }
+ encode(s, sig, param_n * param_k);
+ memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+ *siglen = param_sig_bytes;
+err:
+ mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+ mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+ mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+ mayo_secure_clear(r, K_MAX * O_MAX + 1);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(&sk, sizeof(sk_t));
+ mayo_secure_clear(Ox, N_MINUS_O_MAX);
+ mayo_secure_clear(tmp,
+ DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+ return ret;
+}
+
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk) {
+ int ret = MAYO_OK;
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ size_t siglen = param_sig_bytes;
+ ret = mayo_sign_signature(p, sm, &siglen, m, mlen, csk);
+ if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes)
+ goto err;
+
+ memmove(sm + param_sig_bytes, m, mlen);
+ *smlen = siglen + mlen;
+err:
+ return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk) {
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ if (smlen < (size_t)param_sig_bytes) {
+ return MAYO_ERR;
+ }
+ int result = mayo_verify(p, sm + param_sig_bytes, smlen - param_sig_bytes, sm,
+ pk);
+
+ if (result == MAYO_OK) {
+ *mlen = smlen - param_sig_bytes;
+ memmove(m, sm + param_sig_bytes, *mlen);
+ }
+
+ return result;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk) {
+ int ret = MAYO_OK;
+ unsigned char *seed_sk = csk;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+ alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+ unsigned char *seed_pk;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ // seed_sk $←- B^(sk_seed bytes)
+ #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+ randombytes(seed_sk, param_sk_seed_bytes);
+ #else
+ if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+ ret = MAYO_ERR;
+ goto err;
+ }
+ #endif
+
+ // S ← shake256(seedsk, pk seed bytes + O bytes)
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ // seed_pk ← s[0 : pk_seed_bytes]
+ seed_pk = S;
+
+ // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+ decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+
+ int m_legs = param_m / 32;
+
+ uint64_t *P1 = P;
+ uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+ // compute P3 = O^t * (P1*O + P2)
+ Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+ // store seed_pk in cpk
+ memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+ alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+ // compute Upper(P3) and store in cpk
+ m_upper(m_legs, P3, P3_upper, param_o);
+
+ memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+ return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *pk) {
+ #ifdef MAYO_VARIANT
+ (void)p;
+ #endif
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ PK_PRF(pk, param_P1_bytes + param_P2_bytes, cpk, param_pk_seed_bytes);
+ pk += param_P1_bytes + param_P2_bytes;
+ memmove(pk, cpk + param_pk_seed_bytes, param_P3_bytes);
+ return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *sk) {
+ int ret = MAYO_OK;
+ unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+ uint64_t *P = sk->p;
+ unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+ const int param_o = PARAM_o(p);
+ const int param_v = PARAM_v(p);
+ const int param_O_bytes = PARAM_O_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+ const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+ const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+ const unsigned char *seed_sk = csk;
+ unsigned char *seed_pk = S;
+
+ shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+ param_sk_seed_bytes);
+ decode(S + param_pk_seed_bytes, O,
+ param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+ // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+ PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+ param_pk_seed_bytes);
+
+ uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ uint64_t *P1 = P;
+ // compute L_i = (P1 + P1^t)*O + P2
+ uint64_t *L = P2;
+ P1P1t_times_O(p, P1, O, L);
+
+ // write to sk
+ memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+ P[i] = BSWAP64(P[i]);
+ }
+#endif
+
+ mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+ mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+ return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *cpk) {
+ unsigned char tEnc[M_BYTES_MAX];
+ unsigned char t[M_MAX];
+ unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+ unsigned char s[K_MAX * N_MAX];
+ alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+ unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+ const int param_m_bytes = PARAM_m_bytes(p);
+ const int param_P1_bytes = PARAM_P1_bytes(p);
+ const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+ const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+ const int param_sig_bytes = PARAM_sig_bytes(p);
+ const int param_digest_bytes = PARAM_digest_bytes(p);
+ const int param_salt_bytes = PARAM_salt_bytes(p);
+
+ int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+ if (ret != MAYO_OK) {
+ return MAYO_ERR;
+ }
+
+ uint64_t *P1 = pk;
+ uint64_t *P2 = pk + (param_P1_bytes / 8);
+ uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+ for (int i = 0; i < param_P1_bytes / 8; ++i) {
+ P1[i] = BSWAP64(P1[i]);
+ }
+ for (int i = 0; i < param_P2_bytes / 8; ++i) {
+ P2[i] = BSWAP64(P2[i]);
+ }
+ for (int i = 0; i < param_P3_bytes / 8; ++i) {
+ P3[i] = BSWAP64(P3[i]);
+ }
+#endif
+
+ // hash m
+ shake256(tmp, param_digest_bytes, m, mlen);
+
+ // compute t
+ memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+ param_salt_bytes);
+ shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+ decode(tEnc, t, param_m);
+
+ // decode s
+ decode(sig, s, param_k * param_n);
+
+ // Compute S*P*S^T
+ alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+ m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+ param_v, param_o, param_k, SPS);
+
+ // combine the vectors in SPS and reduce mod f(X)
+ compute_rhs(p, SPS, y, y);
+
+ if (memcmp(y, t, param_m) == 0) {
+ return MAYO_OK; // good signature
+ }
+ return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stddef.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64 \
+ { 8, 0, 2, 8, 0 } // f(z) = z^64 + x^3*z^3 + x*z^2 + x^3
+#define F_TAIL_96 \
+ { 2, 2, 0, 2, 0 } // f(z) = z^96 + x*z^3 + x*z + x
+#define F_TAIL_128 \
+ { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3 + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX SALT_BYTES_MAX
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+ int m;
+ int n;
+ int o;
+ int k;
+ int q;
+ const unsigned char *f_tail;
+ int m_bytes;
+ int O_bytes;
+ int v_bytes;
+ int r_bytes;
+ int R_bytes;
+ int P1_bytes;
+ int P2_bytes;
+ int P3_bytes;
+ int csk_bytes;
+ int esk_bytes;
+ int cpk_bytes;
+ int epk_bytes;
+ int sig_bytes;
+ int salt_bytes;
+ int sk_seed_bytes;
+ int digest_bytes;
+ int pk_seed_bytes;
+ const char *name;
+} mayo_params_t;
+
+typedef struct sk_t {
+ uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+ uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+ size_t *smlen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+ size_t *mlen, const unsigned char *sm,
+ size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+ unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+ unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+ sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(): it returns 0 if signature verification succeeds and 1 otherwise.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message that was signed
+ * @param[in] mlen Message length
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+ size_t mlen, const unsigned char *sig,
+ const unsigned char *pk);
+
+#endif
+
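Taken together, the API above follows the usual keypair/sign/open flow. A usage sketch, illustrative only and not part of the patch, under the assumption of a fixed-variant MAYO_2 build (MAYO_VARIANT defined at compile time, so the params pointer is unused and the MAYO_2_* size macros apply):

#include <stddef.h>
#include <string.h>
#include <mayo.h>

int mayo2_roundtrip_example(void) {
    unsigned char cpk[MAYO_2_cpk_bytes], csk[MAYO_2_csk_bytes];
    unsigned char msg[] = "mayo";
    unsigned char sm[MAYO_2_sig_bytes + sizeof msg];
    unsigned char opened[sizeof msg];
    size_t smlen = 0, mlen = 0;

    /* In a fixed-variant build the params argument is ignored by the PARAM_*
     * macros above; a dynamic (ENABLE_PARAMS_DYNAMIC) build would pass &MAYO_2. */
    if (mayo_keypair(NULL, cpk, csk) != MAYO_OK) {
        return MAYO_ERR;
    }
    if (mayo_sign(NULL, sm, &smlen, msg, sizeof msg, csk) != MAYO_OK) {
        return MAYO_ERR;
    }
    if (mayo_open(NULL, opened, &mlen, sm, smlen, cpk) != MAYO_OK) {
        return MAYO_ERR;
    }
    return memcmp(opened, msg, mlen) == 0 ? MAYO_OK : MAYO_ERR;
}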
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stdint.h>
+#include <stddef.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+ int32_t diff = b - a;
+ return (uint32_t) (diff >> (8*sizeof(uint32_t)-1));
+}
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+ int64_t diff = ((int64_t) b) - ((int64_t) a);
+ return (uint64_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF
+static inline uint32_t ct_compare_32(int a, int b) {
+ return (uint32_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF
+static inline uint64_t ct_compare_64(int a, int b) {
+ return (uint64_t)((-(int64_t)(a ^ b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00, else 0xFF
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+ return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+ OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+ OQS_MEM_cleanse(mem, size);
+}
+
+#endif
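The ct_* helpers above return all-ones or all-zeros masks rather than booleans, so callers (for example the echelon-form code) can select between values without data-dependent branches. A small illustration of the convention, as a sketch only and assuming the mem.h helpers are in scope:

#include <assert.h>
#include <stdint.h>

/* Assumes ct_compare_32() and ct_is_greater_than() from mem.h are in scope. */
static void ct_mask_example(void) {
    assert(ct_compare_32(7, 7) == 0x00000000u);      /* equal     -> all zeros */
    assert(ct_compare_32(7, 8) == 0xFFFFFFFFu);      /* not equal -> all ones  */
    assert(ct_is_greater_than(8, 7) == 0xFFFFFFFFu); /* 8 > 7     -> all ones  */

    /* Branchless select: picks a when the mask is all ones, b otherwise. */
    uint32_t mask = ct_is_greater_than(8, 7);
    uint32_t a = 0x1234, b = 0x5678, picked = (a & mask) | (b & ~mask);
    assert(picked == a);
}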
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/params.c b/src/sig/mayo/pqmayo_mayo-2_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) \
+ const mayo_params_t nm = { \
+ .m = PARAM_JOIN2(nm, m), \
+ .n = PARAM_JOIN2(nm, n), \
+ .o = PARAM_JOIN2(nm, o), \
+ .k = PARAM_JOIN2(nm, k), \
+ .q = PARAM_JOIN2(nm, q), \
+ .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+ .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+ .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+ .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+ .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+ .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+ .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+ .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+ .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+ .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+ .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+ .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+ .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+ .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+ .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+ .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+ .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+ .name = #nm \
+ };
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
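MAYO_GEN_PARAMS instantiates one mayo_params_t per parameter set from the MAYO_x_* macros in mayo.h; these structs only exist in ENABLE_PARAMS_DYNAMIC builds. A sketch of runtime selection, illustrative only; pick_params_by_name is a hypothetical helper, not part of the patch:

#include <stdio.h>
#include <string.h>
#include <mayo.h>

#ifdef ENABLE_PARAMS_DYNAMIC
/* Map a parameter-set name onto the corresponding generated struct. */
static const mayo_params_t *pick_params_by_name(const char *name) {
    if (strcmp(name, MAYO_1_name) == 0) return &MAYO_1;
    if (strcmp(name, MAYO_2_name) == 0) return &MAYO_2;
    if (strcmp(name, MAYO_3_name) == 0) return &MAYO_3;
    if (strcmp(name, MAYO_5_name) == 0) return &MAYO_5;
    return NULL;
}

/* Print the key and signature sizes carried by a parameter set. */
static void print_sizes(const mayo_params_t *p) {
    printf("%s: cpk=%d csk=%d sig=%d bytes\n", PARAM_name(p),
           PARAM_cpk_bytes(p), PARAM_csk_bytes(p), PARAM_sig_bytes(p));
}
#endif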
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <immintrin.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ cols_used ++;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[(2*r*O_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+ acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+ acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+ }
+ }
+}
+
+static
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+ const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+ acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+ }
+ }
+}
+
+
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ cols_used += 1;
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ pos += (V_MAX -c - 1);
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k ));
+ __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+ __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+ __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k), acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+ }
+ }
+}
+
+
+static
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *L = (__m256i *) _L;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(k*O_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+static
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *Pv = (__m256i *) _Pv;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+static
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+
+ // P1 * S1
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ P1_cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // P2 * S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P3 = (__m256i *) _P3;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+ const __m256i *PS2 = (__m256i *) _PS2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c);
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1);
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+ acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+ acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+ }
+#if K_MAX % 2 == 1
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+ acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+ }
+}
+
+
+#undef K_OVER_2
+#endif
+
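
The O_multabs/V_multabs/S*_multabs arguments consumed above are per-scalar GF(16) product tables; _mm256_shuffle_epi8 uses each input nibble as an index into such a table, so one instruction performs 32 lookups. A scalar sketch of the same idea follows; the table layout here is simplified for illustration (judging by the k/2 indexing above, the real tables pack products for two scalars into the two nibbles of each table byte):

    #include <stdint.h>
    #include <stdio.h>

    // GF(16) multiply mod x^4 + x + 1, same reduction as mul_f later in this patch
    static uint8_t gf16_mul(uint8_t a, uint8_t b) {
        unsigned p = (a & 1) * b ^ (a & 2) * b ^ (a & 4) * b ^ (a & 8) * b;
        unsigned top = p & 0xf0;
        return (uint8_t)((p ^ (top >> 4) ^ (top >> 3)) & 0x0f);
    }

    int main(void) {
        uint8_t s = 7;               // an arbitrary GF(16) scalar
        uint8_t multab[16];          // product table: multab[x] = s * x
        for (int x = 0; x < 16; x++) multab[x] = gf16_mul(s, (uint8_t)x);

        // "shuffle" = one table lookup per nibble; vpshufb does 32 of these at once
        uint8_t packed = 0xA3;       // two GF(16) elements packed in one byte
        printf("7*0x3 = %x, 7*0xA = %x\n", multab[packed & 0x0f], multab[packed >> 4]);
        return 0;
    }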
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+ }
+}
+
+
+static
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+
+ const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+ }
+ }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[O_MAX] = {0};
+ cols_used += 1;
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+ pos += (V_MAX -c - 1);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+ temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+ }
+ }
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k ));
+ __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256(acc + (r*O_MAX + k ), acc0 ^ temp[k ] ^ _mm256_slli_epi16(t,4));
+ _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+ }
+ }
+}
+
+
+static
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *L = (__m256i *) _L;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+static
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+ const __m256i *Pv = (__m256i *) _Pv;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+static
+inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
+// same as mayo_12_P1_times_Vt_avx2
+static
+inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+ mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+ const __m256i *PS2 = (__m256i *) _PS2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+ size_t k;
+
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+static
+inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P2 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+static
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P1 = (__m256i *) _P1;
+ const __m256i *P2 = (__m256i *) _P2;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+
+ // P1 * S1
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ P1_cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // P2 * S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+ size_t k,c;
+ const __m256i *P3 = (__m256i *) _P3;
+ __m256i *acc = (__m256i *) _acc;
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2] = {0};
+
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd = _mm256_loadu_si256(P3 + cols_used);
+ __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+ in_odd &= low_nibble_mask;
+ cols_used ++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k + 1 < K_MAX; k+=2)
+ {
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+ acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+ }
+#if K_MAX % 2 == 1
+ __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+ acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+ }
+}
+
+
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+ (void) size;
+ int m_vecs_stored = 0;
+
+ for (int r = 0; r < O_MAX; ++r) {
+ const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+ __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+ _out[0] = _in[0];
+ m_vecs_stored++;
+ for (int c = r + 1; c < O_MAX; ++c) {
+ const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+ const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+ _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+ _out[0] = _in2[0] ^ _in3[0];
+ m_vecs_stored++;
+ }
+ }
+}
+
+
+#undef K_OVER_2
+#endif
+
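
mayo12_m_upper above emits upper(in + in^T) row by row: diagonal entries are copied, off-diagonal entries pick up the transposed partner, and each matrix entry is one 32-byte m-vector moved in a single 256-bit copy. A plain-C reference of the same index walk, with the vector width made an explicit parameter (illustrative names, not the patch API):

    #include <stdint.h>

    // limbs = uint64_t words per matrix entry (m_legs*2 == 4 for MAYO-2, one __m256i)
    static void m_upper_ref(int limbs, const uint64_t *in, uint64_t *out, int size) {
        int stored = 0;
        for (int r = 0; r < size; ++r) {
            for (int c = r; c < size; ++c, ++stored) {
                for (int l = 0; l < limbs; ++l) {
                    uint64_t v = in[limbs * (r * size + c) + l];
                    if (c != r)                              // off-diagonal: add transposed entry
                        v ^= in[limbs * (c * size + r) + l];
                    out[limbs * stored + l] = v;
                }
            }
        }
    }

    int main(void) {
        uint64_t in[4]  = {1, 2, 3, 4};   // 2x2 toy matrix with 1-limb entries
        uint64_t out[3] = {0};
        m_upper_ref(1, in, out, 2);       // out = {1, 2^3, 4} = {1, 1, 4}
        return out[1] == 1 ? 0 : 1;
    }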
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v, O: v x o
+static
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ cols_used ++;
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[2*O_MAX] = {0};
+ size_t pos = r;
+ for (size_t c = 0; c < r; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ pos += (V_MAX -c - 1);
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ for (size_t c = r+1; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used ++;
+
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ temp[2*k] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+ temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+ temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+ temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (size_t k = 0; k < O_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+ }
+}
+
+static
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < O_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c = r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+static
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t P1_cols_used = 0;
+ for (size_t r = 0; r < V_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ // P1 times S1
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c=r; c < V_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ P1_cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // P2 times S2
+ for (c=0; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k,c;
+ size_t cols_used = 0;
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (c=r; c < O_MAX; c++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+ cols_used++;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+ mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc);
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+ const __m256i low_nibble_mask = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+ size_t k;
+ for (size_t c = 0; c < K_MAX; c++)
+ {
+ // do multiplications for one row and accumulate results in temporary format
+ __m256i temp[K_OVER_2*2*2] = {0};
+ for (size_t r = 0; r < O_MAX; r++)
+ {
+ __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // first 64 field elements
+ __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+ in_odd0 &= low_nibble_mask;
+ __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // last 64 field elements (#32 to #96)
+ __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+ in_odd1 &= low_nibble_mask;
+
+ for (k = 0; k < K_OVER_2; k++)
+ {
+ temp[4*k] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+ temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+ temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+ temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+ }
+ }
+
+ // convert to normal format and add to accumulator
+ for (k = 0; k+1 < K_MAX; k+=2)
+ {
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+ __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+ __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6), acc2 ^ temp[2*k + 1] ^ t0);
+ _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+ }
+#if K_MAX % 2 == 1
+ __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+ __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+ __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k ],4)) & low_nibble_mask;
+ __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6), acc0 ^ temp[2*k ] ^ _mm256_slli_epi16(t0,4));
+ _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2), acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+ }
+}
+
+#undef K_OVER_2
+#endif
+
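
The "* 6" stride and the paired loads at limb offsets 0 and +2 throughout this header come from the MAYO-3 packing: 96 GF(16) coefficients per m-vector are nibble-packed into 48 bytes, i.e. six uint64_t limbs, so two overlapping 32-byte loads (byte offsets 0 and 16) cover coefficients 0..63 and 32..95. A small check of that arithmetic:

    #include <assert.h>
    #include <stdint.h>

    int main(void) {
        const unsigned coeffs = 96;                   // GF(16) coefficients per m-vector (MAYO-3)
        const unsigned bytes  = coeffs / 2;           // two coefficients per byte -> 48
        assert(bytes / sizeof(uint64_t) == 6);        // hence the "* 6" limb stride
        assert(32 * 2 == 64);                         // load at byte 0 covers coefficients 0..63
        assert(16 * 2 == 32 && 16 + 32 == bytes);     // load at byte 16 covers coefficients 32..95
        return 0;
    }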
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+ // carryless multiply
+ unsigned char p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ unsigned char top_p = p & 0xf0;
+ unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f;
+ return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+ // carryless multiply
+ uint64_t p;
+ p = (a & 1)*b;
+ p ^= (a & 2)*b;
+ p ^= (a & 4)*b;
+ p ^= (a & 8)*b;
+
+ // reduce mod x^4 + x + 1
+ uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+ uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f;
+ return out;
+}
+
+// GF(16) addition
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) subtraction
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+ return a ^ b;
+}
+
+// GF(16) negation
+static inline unsigned char neg_f(unsigned char a) {
+ return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+ // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+ // 10, 4, 3, 8}; return table[a & 15];
+
+ unsigned char a2 = mul_f(a, a);
+ unsigned char a4 = mul_f(a2, a2);
+ unsigned char a8 = mul_f(a4, a4);
+ unsigned char a6 = mul_f(a2, a4);
+ unsigned char a14 = mul_f(a8, a6);
+
+ return a14;
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+ const unsigned char *b, int n, int m) {
+ unsigned char ret = 0;
+ for (int i = 0; i < n; ++i, b += m) {
+ ret = add_f(mul_f(a[i], *b), ret);
+ }
+ return ret;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int colrow_ab, int row_a, int col_b) {
+ for (int i = 0; i < row_a; ++i, a += colrow_ab) {
+ for (int j = 0; j < col_b; ++j, ++c) {
+ *c = lincomb(a, b + j, colrow_ab, col_b);
+ }
+ }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+ unsigned char *c, int m, int n) {
+ for (int i = 0; i < m; ++i) {
+ for (int j = 0; j < n; ++j) {
+ *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j));
+ }
+ }
+}
+
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+ uint64_t *out) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ out[i] = in[i];
+ }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+ uint64_t *acc) {
+ for (int i = 0; i < m_legs * 2; i++) {
+ acc[i] ^= in[i];
+ }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+ uint32_t a_msb;
+ uint32_t a32 = a;
+ uint32_t b32 = b;
+ uint32_t r32 = a32 * (b32 & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 1) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 2) & 1);
+
+ a_msb = a32 & 0x88888888; // MSB, 3rd bits
+ a32 ^= a_msb; // clear MSB
+ a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+ r32 ^= (a32) * ((b32 >> 3) & 1);
+
+ return r32;
+
+}
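
gf16v_mul_u32 is the SWAR form of the same multiplication: eight GF(16) elements live one per nibble of a uint32_t, and multiplying the whole word by x is done by clearing the top bit of every nibble (the 0x88888888 mask) before shifting, then folding those bits back in as x + 1 (the multiply by 3). A loop-rolled, self-contained sketch that checks it nibble by nibble against the scalar multiply:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned char gf16_mul(unsigned char a, unsigned char b) { /* mirrors mul_f above */
    unsigned char p = (a & 1)*b; p ^= (a & 2)*b; p ^= (a & 4)*b; p ^= (a & 8)*b;
    unsigned char top = p & 0xf0;
    return (p ^ (top >> 4) ^ (top >> 3)) & 0x0f;
}

/* loop-rolled equivalent of gf16v_mul_u32 above: multiply 8 packed nibbles by b */
static uint32_t gf16v_mul_u32_ref(uint32_t a, uint8_t b) {
    uint32_t r = a * (uint32_t)(b & 1);
    for (int i = 1; i < 4; i++) {
        uint32_t msb = a & 0x88888888u;      /* top bit of every nibble               */
        a ^= msb;                            /* clear it so the shift stays in-lane   */
        a = (a << 1) ^ ((msb >> 3) * 3);     /* per-nibble multiply by x, reduced     */
        r ^= a * (uint32_t)((b >> i) & 1);
    }
    return r;
}

int main(void) {
    const uint32_t v = 0x9ABCDEF5u;          /* eight packed GF(16) elements */
    for (uint8_t b = 0; b < 16; b++) {
        uint32_t r = gf16v_mul_u32_ref(v, b);
        for (int i = 0; i < 8; i++)
            assert(((r >> (4*i)) & 0xf) == gf16_mul((v >> (4*i)) & 0xf, b));
    }
    printf("packed multiply matches the scalar reference\n");
    return 0;
}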
+
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+ for(int i=0; i < m_legs*2;i++){
+ acc[i] ^= gf16v_mul_u64(in[i], a);
+ }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_msb = 0x8888888888888888ULL;
+    for(int i=0; i < m_legs*2; i++) {
+        uint64_t t = in[i] & mask_msb;
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+    }
+}
+
+#endif
+#include
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+ const unsigned char *input, size_t inputByteLen) {
+ (void) inputByteLen;
+ uint8_t iv[12] = { 0 };
+ aes128ctr_prf(output, outputByteLen, input, iv);
+ return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.c b/src/sig/mayo/pqmayo_mayo-2_opt/api.c
new file mode 100644
index 0000000000..a7cf85eedf
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_2
+#else
+#define MAYO_PARAMS 0
+#endif
+
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+ return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk) {
+ return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk) {
+ return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk) {
+ return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk) {
+ if (siglen != CRYPTO_BYTES)
+ return -1;
+ return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.h b/src/sig/mayo/pqmayo_mayo-2_opt/api.h
new file mode 100644
index 0000000000..265a5639db
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 5488
+#define CRYPTO_BYTES 180
+
+#define CRYPTO_ALGNAME "MAYO-2"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+ size_t *siglen, const unsigned char *m,
+ size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+ const unsigned char *sm, size_t smlen,
+ const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+ const unsigned char *m, size_t mlen,
+ const unsigned char *pk);
+
+#endif /* api_h */
+
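
api.c and api.h above expose the NIST-style crypto_sign interface for MAYO-2; CRYPTO_SECRETKEYBYTES is only 24 because the compact secret key is a seed that is re-expanded when signing. A hedged round-trip sketch of how this interface is typically exercised; the include path, the MAYO_NAMESPACE definition, and the usual sm = signature || message convention are assumptions about the surrounding build rather than something this patch spells out:

#include <stddef.h>
#include <stdio.h>
#include <string.h>
#include "api.h"   /* assumed include path; assumes MAYO_NAMESPACE is defined by the build */

int main(void) {
    unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
    unsigned char msg[32] = "message to be signed";
    unsigned char sm[sizeof msg + CRYPTO_BYTES];   /* signed message: sig || msg (assumed) */
    unsigned char opened[sizeof sm];
    size_t smlen = 0, mlen = 0;

    if (crypto_sign_keypair(pk, sk) != 0) return 1;             /* sk is a 24-byte seed   */
    if (crypto_sign(sm, &smlen, msg, sizeof msg, sk) != 0) return 1;
    if (crypto_sign_open(opened, &mlen, sm, smlen, pk) != 0) return 1;
    if (mlen != sizeof msg || memcmp(opened, msg, mlen) != 0) return 1;

    printf("MAYO-2 sign/open round trip OK (signature adds %d bytes)\n", CRYPTO_BYTES);
    return 0;
}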
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#ifdef ENABLE_CT_TESTING
+#include
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+ mayo12_m_upper(m_legs, in, out, size);
+#else
+ int m_vecs_stored = 0;
+ for (int r = 0; r < size; r++) {
+ for (int c = r; c < size; c++) {
+ m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored );
+ if (r != c) {
+ m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored );
+ }
+ m_vecs_stored ++;
+ }
+ }
+#endif
+}
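
m_upper is the Upper() step from key generation (see the comment in arithmetic.h below): the diagonal entries are copied, and for r < c the entry (r, c) is XORed with its mirror (c, r) so only the upper triangle needs to be stored. The same fold on plain bytes, as a self-contained illustration:

#include <assert.h>
#include <stdio.h>

/* Fold an n x n byte matrix into packed upper-triangular form:
 * out(r,c) = in[r][c] ^ in[c][r] for r < c, and in[r][r] on the diagonal.
 * (The real code does this per vector of m GF(16) coefficients.)        */
static void upper_fold(const unsigned char *in, unsigned char *out, int n) {
    int stored = 0;
    for (int r = 0; r < n; r++)
        for (int c = r; c < n; c++) {
            out[stored] = in[r*n + c];
            if (r != c)
                out[stored] ^= in[c*n + r];
            stored++;
        }
}

int main(void) {
    const unsigned char A[3*3] = { 1, 2, 3,
                                   4, 5, 6,
                                   7, 8, 9 };
    unsigned char U[6];
    upper_fold(A, U, 3);
    assert(U[0] == 1 && U[1] == (2 ^ 4) && U[2] == (3 ^ 7));
    assert(U[3] == 5 && U[4] == (6 ^ 8) && U[5] == 9);
    printf("upper-triangular fold OK\n");
    return 0;
}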
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ (void) p;
+ mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ (void) p;
+ mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ (void) p;
+ mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+ #ifndef MAYO_VARIANT
+ const int m_legs = PARAM_m(p) / 32;
+ #else
+ (void) p;
+ #endif
+ const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    int bs_mat_entries_used = 0;
+ for (int r = 0; r < param_v; r++) {
+ for (int c = r; c < param_v; c++) {
+ if(c==r) {
+ bs_mat_entries_used += 1;
+ continue;
+ }
+ for (int k = 0; k < param_o; k += 1) {
+
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+ vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k], acc + 6 * (r * param_o + k));
+ vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k], acc + 8 * (r * param_o + k));
+ vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+ m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mayo_V_multabs_avx2(V, V_multabs);
+ mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+ mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+ mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int param_o = PARAM_o(p);
+ const int param_k = PARAM_k(p);
+
+ alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+ mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+ param_k, param_n - param_o, param_o);
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+ mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+ Y, param_k, param_n - param_o,
+ param_k);
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+ (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+ __m256i O_multabs[O_MAX/2*V_MAX];
+ mayo_O_multabs_avx2(O, O_multabs);
+ mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+ mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ const int param_v = PARAM_v(p);
+ const int param_o = PARAM_o(p);
+ const int param_m = PARAM_m(p);
+ const int param_n = PARAM_n(p);
+ const int m_legs = PARAM_m(p) / 32;
+ mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1);
+ mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [ 0   P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS) {
+ (void) m;
+#if MAYO_AVX
+ const int n = o + v;
+
+ /* Old approach which is constant time but doesn't have to be */
+ unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+ unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+
+ mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+ __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+ __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+ mayo_S1_multabs_avx2(S1, S1_multabs);
+ mayo_S2_multabs_avx2(S2, S2_multabs);
+ mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+ mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+ // S^T * PS = S1^t*PS1 + S2^t*PS2
+ //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+ mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+ mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+ NOT IMPLEMENTED
+#endif
+#else
+ const int n = o + v;
+ alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+ mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+ mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+ const unsigned char *y, const unsigned char *r,
+ unsigned char *x, int k, int o, int m, int A_cols) {
+ #ifdef MAYO_VARIANT
+ (void) p;
+ #endif
+ unsigned char finished;
+ int col_upper_bound;
+ unsigned char correct_column;
+
+ // x <- r
+ for (int i = 0; i < k * o; i++) {
+ x[i] = r[i];
+ }
+
+ // compute Ar;
+ unsigned char Ar[M_MAX];
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+ }
+ mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+ // move y - Ar to last column of matrix A
+ for (int i = 0; i < m; i++) {
+ A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+ }
+
+ EF(A, m, k * o + 1);
+
+ // check if last row of A (excluding the last entry of y) is zero
+ unsigned char full_rank = 0;
+ for (int i = 0; i < A_cols - 1; i++) {
+ full_rank |= A[(m - 1) * A_cols + i];
+ }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+ VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+ if (full_rank == 0) {
+ return 0;
+ }
+
+ // back substitution in constant time
+ // the index of the first nonzero entry in each row is secret, which makes
+ // things less efficient
+
+ for (int row = m - 1; row >= 0; row--) {
+ finished = 0;
+ col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+ // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+ for (int col = row; col <= col_upper_bound; col++) {
+
+ // Compare two chars in constant time.
+ // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+ correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+ unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+ x[col] ^= u;
+
+ for (int i = 0; i < row; i += 8) {
+ uint64_t tmp = ( (uint64_t) A[ i * A_cols + col] << 0) ^ ( (uint64_t) A[(i+1) * A_cols + col] << 8)
+ ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+ ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+ ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+ tmp = mul_fx8(u, tmp);
+
+ A[ i * A_cols + A_cols - 1] ^= (tmp ) & 0xf;
+ A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+ A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+ A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+ A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+ A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+ A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+ A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+ }
+
+ finished = finished | correct_column;
+ }
+ }
+ return 1;
+}
+
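
sample_solution first runs EF() and then back-substitutes, but because the position of the first nonzero entry in each row is secret it touches every candidate column in a small window under the correct_column / finished masks, so the access pattern is data independent. For contrast, a plain variable-time back substitution over GF(16) looks like the sketch below; it illustrates only the algebra, not the constant-time technique, and gf16_mul again mirrors mul_f:

#include <assert.h>
#include <stdio.h>

static unsigned char gf16_mul(unsigned char a, unsigned char b) { /* mirrors mul_f */
    unsigned char p = (a & 1)*b; p ^= (a & 2)*b; p ^= (a & 4)*b; p ^= (a & 8)*b;
    unsigned char top = p & 0xf0;
    return (p ^ (top >> 4) ^ (top >> 3)) & 0x0f;
}

/* naive back substitution on an augmented matrix in row echelon form
 * (pivots already scaled to 1, as EF() guarantees); x holds the chosen
 * values for the free variables on entry, variable time, leaks the pivots */
static void back_substitute(const unsigned char *A, int nrows, int ncols,
                            unsigned char *x) {
    for (int row = nrows - 1; row >= 0; row--) {
        int piv = -1;
        for (int col = 0; col < ncols - 1; col++)
            if (A[row*ncols + col] != 0) { piv = col; break; }
        if (piv < 0) continue;                                /* zero row   */
        unsigned char v = A[row*ncols + (ncols - 1)];         /* right side */
        for (int col = piv + 1; col < ncols - 1; col++)
            v ^= gf16_mul(A[row*ncols + col], x[col]);
        x[piv] = v;                                           /* pivot == 1 */
    }
}

int main(void) {
    /* echelon form of a toy system over GF(16): 3 unknowns, 2 equations */
    const unsigned char A[2*4] = { 1, 2, 3, 9,
                                   0, 1, 5, 4 };
    unsigned char x[3] = { 0, 0, 7 };        /* x[2] is free; pick any value */
    back_substitute(A, 2, 4, x);

    /* plug the solution back into every row */
    for (int row = 0; row < 2; row++) {
        unsigned char lhs = 0;
        for (int col = 0; col < 3; col++)
            lhs ^= gf16_mul(A[row*4 + col], x[col]);
        assert(lhs == A[row*4 + 3]);
    }
    printf("back substitution solves the echelon system\n");
    return 0;
}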
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include
+#include
+#include
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+ #include
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+ #include
+ #include
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+ #include
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+ out[6] = in[6];
+ out[7] = in[7];
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+ acc[6] ^= in[6];
+ acc[7] ^= in[7];
+}
+
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 8;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 8;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 8;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_128(bins + 5 * 8, bins + 10 * 8);
+ m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+ m_vec_mul_add_x_inv_128(bins + 10 * 8, bins + 7 * 8);
+ m_vec_mul_add_x_128(bins + 12 * 8, bins + 6 * 8);
+ m_vec_mul_add_x_inv_128(bins + 7 * 8, bins + 14 * 8);
+ m_vec_mul_add_x_128(bins + 6 * 8, bins + 3 * 8);
+ m_vec_mul_add_x_inv_128(bins + 14 * 8, bins + 15 * 8);
+ m_vec_mul_add_x_128(bins + 3 * 8, bins + 8 * 8);
+ m_vec_mul_add_x_inv_128(bins + 15 * 8, bins + 13 * 8);
+ m_vec_mul_add_x_128(bins + 8 * 8, bins + 4 * 8);
+ m_vec_mul_add_x_inv_128(bins + 13 * 8, bins + 9 * 8);
+ m_vec_mul_add_x_128(bins + 4 * 8, bins + 2 * 8);
+ m_vec_mul_add_x_inv_128(bins + 9 * 8, bins + 1 * 8);
+ m_vec_mul_add_x_128(bins + 2 * 8, bins + 1 * 8);
+ vec_copy_128(bins + 8, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include
+#include
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+}
+
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ uint64_t a_msb;
+ uint64_t a64 = a;
+ uint64_t b32 = b;
+ uint64_t r64 = a64 * (b32 & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 1) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 2) & 1);
+
+ a_msb = a64 & mask_msb; // MSB, 3rd bits
+ a64 ^= a_msb; // clear MSB
+ a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+ r64 ^= (a64) * ((b32 >> 3) & 1);
+
+ return r64;
+}
+
+static inline uint32_t mul_table(uint8_t b){
+ uint32_t x = ((uint32_t) b) * 0x08040201;
+
+ uint32_t high_nibble_mask = 0xf0f0f0f0;
+
+ uint32_t high_half = x & high_nibble_mask;
+ return (x ^ (high_half >> 4) ^ (high_half >> 3));
+}
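
mul_table(b) packs b, b*x, b*x^2 and b*x^3 into the four bytes of a 32-bit word (the low nibble of each byte holds the reduced product; callers mask off the upper bits), and vec_mul_add_64 uses it to multiply 16 packed nibbles at once, one bit-plane at a time. A self-contained check that byte i of the table really is b*x^i:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

static unsigned char gf16_mul(unsigned char a, unsigned char b) { /* mirrors mul_f */
    unsigned char p = (a & 1)*b; p ^= (a & 2)*b; p ^= (a & 4)*b; p ^= (a & 8)*b;
    unsigned char top = p & 0xf0;
    return (p ^ (top >> 4) ^ (top >> 3)) & 0x0f;
}

static uint32_t mul_table_ref(uint8_t b) {           /* mirrors mul_table above */
    uint32_t x = ((uint32_t) b) * 0x08040201;         /* bytes: b, 2b, 4b, 8b    */
    uint32_t high_half = x & 0xf0f0f0f0;
    return x ^ (high_half >> 4) ^ (high_half >> 3);   /* reduce each byte        */
}

int main(void) {
    for (uint8_t b = 0; b < 16; b++) {
        uint32_t tab = mul_table_ref(b);
        for (int i = 0; i < 4; i++) {
            unsigned char xi = (unsigned char)(1u << i);       /* x^i as an element    */
            assert(((tab >> (8*i)) & 0xf) == gf16_mul(b, xi)); /* low nibble of byte i */
        }
    }
    printf("mul_table bytes agree with b * x^i\n");
    return 0;
}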
+
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 4;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < legs; i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 4;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 4;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
+static
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_64(bins + 5 * 4, bins + 10 * 4);
+ m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+ m_vec_mul_add_x_inv_64(bins + 10 * 4, bins + 7 * 4);
+ m_vec_mul_add_x_64(bins + 12 * 4, bins + 6 * 4);
+ m_vec_mul_add_x_inv_64(bins + 7 * 4, bins + 14 * 4);
+ m_vec_mul_add_x_64(bins + 6 * 4, bins + 3 * 4);
+ m_vec_mul_add_x_inv_64(bins + 14 * 4, bins + 15 * 4);
+ m_vec_mul_add_x_64(bins + 3 * 4, bins + 8 * 4);
+ m_vec_mul_add_x_inv_64(bins + 15 * 4, bins + 13 * 4);
+ m_vec_mul_add_x_64(bins + 8 * 4, bins + 4 * 4);
+ m_vec_mul_add_x_inv_64(bins + 13 * 4, bins + 9 * 4);
+ m_vec_mul_add_x_64(bins + 4 * 4, bins + 2 * 4);
+ m_vec_mul_add_x_inv_64(bins + 9 * 4, bins + 1 * 4);
+ m_vec_mul_add_x_64(bins + 2 * 4, bins + 1 * 4);
+ vec_copy_64(bins + 4, out);
+}
+
+#endif
+
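
multiply_bins_64 and its 96/128 siblings implement the bucket trick used by the PS/SPS accumulators later in this patch: callers first XOR each m-vector into one of 16 bins indexed by its GF(16) coefficient, and this routine then folds the bins together with a fixed chain of multiplications by x and x^-1 so that bin i ends up contributing i times its contents, with the result landing in bin 1. A scalar sanity check of that chain (multiplying by 2 and by 9 is multiplication by x and x^-1 in this field):

#include <assert.h>
#include <stdio.h>

static unsigned char gf16_mul(unsigned char a, unsigned char b) { /* mirrors mul_f */
    unsigned char p = (a & 1)*b; p ^= (a & 2)*b; p ^= (a & 4)*b; p ^= (a & 8)*b;
    unsigned char top = p & 0xf0;
    return (p ^ (top >> 4) ^ (top >> 3)) & 0x0f;
}

/* scalar stand-ins for m_vec_mul_add_x(_inv): acc ^= in * x resp. in * x^-1 */
static void add_x    (unsigned char *b, int from, int to) { b[to] ^= gf16_mul(b[from], 2); }
static void add_x_inv(unsigned char *b, int from, int to) { b[to] ^= gf16_mul(b[from], 9); }

int main(void) {
    unsigned char bins[16], expect;
    for (int trial = 0; trial < 256; trial++) {
        expect = 0;
        for (int i = 0; i < 16; i++) {
            bins[i] = (unsigned char)((trial * 31 + i * 7) & 0xf);  /* arbitrary contents  */
            expect ^= gf16_mul((unsigned char)i, bins[i]);          /* naive sum i * bin_i */
        }
        /* same order as multiply_bins_64 above */
        add_x_inv(bins,  5, 10); add_x(bins, 11, 12);
        add_x_inv(bins, 10,  7); add_x(bins, 12,  6);
        add_x_inv(bins,  7, 14); add_x(bins,  6,  3);
        add_x_inv(bins, 14, 15); add_x(bins,  3,  8);
        add_x_inv(bins, 15, 13); add_x(bins,  8,  4);
        add_x_inv(bins, 13,  9); add_x(bins,  4,  2);
        add_x_inv(bins,  9,  1); add_x(bins,  2,  1);
        assert(bins[1] == expect);
    }
    printf("bin-combination chain reproduces sum_i i * bin_i\n");
    return 0;
}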
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include
+#include
+#include
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+ out[0] = in[0];
+ out[1] = in[1];
+ out[2] = in[2];
+ out[3] = in[3];
+ out[4] = in[4];
+ out[5] = in[5];
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+ acc[0] ^= in[0];
+ acc[1] ^= in[1];
+ acc[2] ^= in[2];
+ acc[3] ^= in[3];
+ acc[4] ^= in[4];
+ acc[5] ^= in[5];
+}
+
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_msb = 0x8888888888888888ULL;
+ for(int i=0; i < 6;i++){
+ uint64_t t = in[i] & mask_msb;
+ acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);
+ }
+}
+
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+ uint64_t mask_lsb = 0x1111111111111111ULL;
+ for(int i=0; i < 6;i++){
+ uint64_t t = in[i] & mask_lsb;
+ acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);
+ }
+}
+
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+ uint32_t tab = mul_table(a);
+
+ uint64_t lsb_ask = 0x1111111111111111ULL;
+
+ for(int i=0; i < 6;i++){
+ acc[i] ^= ( in[i] & lsb_ask) * (tab & 0xff)
+ ^ ((in[i] >> 1) & lsb_ask) * ((tab >> 8) & 0xf)
+ ^ ((in[i] >> 2) & lsb_ask) * ((tab >> 16) & 0xf)
+ ^ ((in[i] >> 3) & lsb_ask) * ((tab >> 24) & 0xf);
+ }
+}
+
+static
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+
+ m_vec_mul_add_x_inv_96(bins + 5 * 6, bins + 10 * 6);
+ m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+ m_vec_mul_add_x_inv_96(bins + 10 * 6, bins + 7 * 6);
+ m_vec_mul_add_x_96(bins + 12 * 6, bins + 6 * 6);
+ m_vec_mul_add_x_inv_96(bins + 7 * 6, bins + 14 * 6);
+ m_vec_mul_add_x_96(bins + 6 * 6, bins + 3 * 6);
+ m_vec_mul_add_x_inv_96(bins + 14 * 6, bins + 15 * 6);
+ m_vec_mul_add_x_96(bins + 3 * 6, bins + 8 * 6);
+ m_vec_mul_add_x_inv_96(bins + 15 * 6, bins + 13 * 6);
+ m_vec_mul_add_x_96(bins + 8 * 6, bins + 4 * 6);
+ m_vec_mul_add_x_inv_96(bins + 13 * 6, bins + 9 * 6);
+ m_vec_mul_add_x_96(bins + 4 * 6, bins + 2 * 6);
+ m_vec_mul_add_x_inv_96(bins + 9 * 6, bins + 1 * 6);
+ m_vec_mul_add_x_96(bins + 2 * 6, bins + 1 * 6);
+ vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include
+
+#ifndef MAYO_VARIANT
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+
+ m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2);
+ m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 3 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 14 * m_legs * 2, bins + 6 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2);
+ m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 7 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 12 * m_legs * 2, bins + 4 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 9 * m_legs * 2);
+ m_vec_add(m_legs, bins + 11 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 10 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 8 * m_legs * 2);
+ m_vec_add(m_legs, bins + 9 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 7 * m_legs * 2, bins + 3 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 6 * m_legs * 2, bins + 2 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_add(m_legs, bins + 5 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 2 * m_legs * 2);
+ m_vec_add(m_legs, bins + 3 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2);
+ m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2);
+ m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2);
+
+ m_vec_copy(m_legs, bins + 1 * m_legs * 2, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [ 0   P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+ const int m, const int v, const int o, const int k, uint64_t *PS) {
+
+ const int n = o + v;
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+ (void)m;
+#else
+ const int m_legs = m / 32;
+#endif
+
+ /* Old approach which is constant time but doesn't have to be
+ unsigned char S1[V_MAX*K_MAX];
+ unsigned char S2[O_MAX*K_MAX];
+ unsigned char *s1_write = S1;
+ unsigned char *s2_write = S2;
+ for (int r=0; r < k; r++)
+ {
+ for (int c = 0; c < n; c++)
+ {
+ if(c < v){
+ *(s1_write++) = S[r*n + c];
+ } else {
+ *(s2_write++) = S[r*n + c];
+ }
+ }
+ }
+
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
+ mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
+ */
+
+ // use more stack efficient version for MAYO_3 and MAYO_5
+ #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+ uint64_t accumulator[M_MAX * N_MAX] = {0};
+ int P1_used;
+ int P3_used;
+ for (int col = 0; col < k; col++) {
+ for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+ accumulator[i] = 0;
+ }
+ P1_used = 0;
+ for (int row = 0; row < v; row++) {
+ for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+ bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+ P1_used ++;
+ }
+
+ for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+ bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+ }
+ }
+
+ P3_used = 0;
+ for (int row = v; row < n; row++) {
+ for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+ bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+ P3_used ++;
+ }
+ }
+
+ for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+ bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+ }
+ }
+
+ #else
+
+ alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+ int P1_used = 0;
+ for (int row = 0; row < v; row++) {
+ for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+ vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+ vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+ }
+#endif
+ P1_used ++;
+ }
+
+
+ for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+ vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+ vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+ vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+ }
+#endif
+ }
+ }
+
+ int P3_used = 0;
+ for (int row = v; row < n; row++) {
+ for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+ vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+ vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+ vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+ for (int col = 0; col < k; col++) {
+ m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+ }
+#endif
+ P3_used ++;
+ }
+ }
+
+ // multiply stuff according to the bins of the accumulator and add to PS.
+ int i = 0;
+ while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+ i++;
+#else
+ m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+ i++;
+#endif
+ }
+
+ #endif
+}
+
+
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int n, uint64_t *SPS){
+ alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0};
+ #if !defined(MAYO_VARIANT)
+ const int m_legs = m/32;
+ #else
+ (void) m;
+ #endif
+ for (int row = 0; row < k; row++) {
+ for (int j = 0; j < n; j++) {
+ for (int col = 0; col < k; col += 1) {
+ #if defined(MAYO_VARIANT) && (M_MAX == 64)
+ vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 );
+ #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 );
+ #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 );
+ #else
+ m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 );
+ #endif
+ }
+ }
+ }
+
+ // multiply stuff according to the bins of the accumulator and add to PS.
+ int i = 0;
+ while (i < k*k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6);
+ i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8);
+ i++;
+#else
+ m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2);
+ i++;
+#endif
+ }
+}
+
+
+// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+
+ int bs_mat_entries_used = 0;
+ for (int r = 0; r < bs_mat_rows; r++) {
+ for (int c = triangular * r; c < bs_mat_cols; c++) {
+ for (int k = 0; k < mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k));
+#endif
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+}
+
+// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+
+ int bs_mat_entries_used = 0;
+ for (int r = 0; r < bs_mat_rows; r++) {
+ for (int c = triangular * r; c < bs_mat_cols; c++) {
+ for (int k = 0; k < mat_rows; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k));
+#endif
+ }
+ bs_mat_entries_used += 1;
+ }
+ }
+}
+
+// multiplies the transpose of a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+ for (int r = 0; r < mat_cols; r++) {
+ for (int c = 0; c < mat_rows; c++) {
+ for (int k = 0; k < bs_mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+ }
+ }
+ }
+}
+
+// multiplies a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+
+ for (int r = 0; r < mat_rows; r++) {
+ for (int c = 0; c < mat_cols; c++) {
+ for (int k = 0; k < bs_mat_cols; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+ (void) m_legs;
+ vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+ (void) m_legs;
+ vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+ (void) m_legs;
+ vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k));
+#else
+ m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+ }
+ }
+ }
+}
+#endif
+
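
A note on the indexing used throughout this header: each logical matrix entry is a vector of m GF(16) coefficients, nibble-packed 16 per uint64_t, so entry e starts at word offset e * m_legs * 2 (m_legs = m/32) and coefficient j of that entry sits in word j/16 at nibble j%16; the literal strides 4, 6 and 8 in the specialized branches are just m_legs * 2 for m = 64, 96 and 128. A minimal sketch of writing and reading one coefficient under that layout (the helpers here are illustrative, not taken from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define M 64                        /* m = 64 as in MAYO-1/2 */
#define M_LEGS (M / 32)
#define WORDS_PER_ENTRY (M_LEGS * 2)

/* coefficient j of entry e, in a nibble-packed array of m-vectors */
static unsigned char get_coeff(const uint64_t *v, int e, int j) {
    return (v[e * WORDS_PER_ENTRY + j / 16] >> (4 * (j % 16))) & 0xf;
}

static void set_coeff(uint64_t *v, int e, int j, unsigned char c) {
    uint64_t *w = &v[e * WORDS_PER_ENTRY + j / 16];
    *w &= ~((uint64_t)0xf << (4 * (j % 16)));
    *w |= (uint64_t)(c & 0xf) << (4 * (j % 16));
}

int main(void) {
    uint64_t mat[3 * WORDS_PER_ENTRY] = {0};   /* three m-vectors */
    set_coeff(mat, 2, 63, 0xA);                /* last coefficient of entry 2 */
    set_coeff(mat, 0, 17, 0x5);
    assert(get_coeff(mat, 2, 63) == 0xA);
    assert(get_coeff(mat, 0, 17) == 0x5);
    assert(get_coeff(mat, 1, 0)  == 0x0);
    printf("nibble-packed m-vector indexing OK\n");
    return 0;
}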
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include
+#include
+#include
+#include
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+ const int leg = index / 16;
+ const int offset = index % 16;
+
+ return (in[leg] >> (offset*4)) & 0xF;
+}
+
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+ int i;
+ unsigned char *out8 = (unsigned char *)out;
+ for(i = 0; i+1 < ncols; i += 2){
+#ifdef TARGET_BIG_ENDIAN
+ out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0) | (in[i+1] << 4);
+#else
+ out8[i/2] = (in[i+0] << 0) | (in[i+1] << 4);
+#endif
+ }
+ if (ncols % 2 == 1){
+#ifdef TARGET_BIG_ENDIAN
+ out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] = (in[i+0] << 0);
+#else
+ out8[i/2] = (in[i+0] << 0);
+#endif
+ }
+}
+
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+ const unsigned char *in8 = (const unsigned char *)in;
+ for(int i = 0; i < legs * 16; i += 2){
+#ifdef TARGET_BIG_ENDIAN
+ out[i] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF;
+ out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+ out[i] = (in8[i/2]) & 0xF;
+ out[i+1] = (in8[i/2] >> 4);
+#endif
+ }
+}
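
ef_pack_m_vec and ef_unpack_m_vec convert between one GF(16) element per byte and two per byte, low nibble first, with the TARGET_BIG_ENDIAN branch reordering bytes so that the uint64_t-based nibble indexing in m_extract_element stays the same. A little-endian round-trip sketch, including the odd-length tail case handled above:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* little-endian versions of ef_pack_m_vec / ef_unpack_m_vec above */
static void pack_nibbles(const unsigned char *in, uint64_t *out, int ncols) {
    unsigned char *out8 = (unsigned char *)out;
    int i;
    for (i = 0; i + 1 < ncols; i += 2)
        out8[i/2] = (unsigned char)(in[i] | (in[i+1] << 4));
    if (ncols % 2 == 1)
        out8[i/2] = in[i];                       /* lone trailing nibble */
}

static void unpack_nibbles(int legs, const uint64_t *in, unsigned char *out) {
    const unsigned char *in8 = (const unsigned char *)in;
    for (int i = 0; i < legs * 16; i += 2) {
        out[i]   = in8[i/2] & 0xF;
        out[i+1] = (unsigned char)(in8[i/2] >> 4);
    }
}

int main(void) {
    unsigned char row[21], back[32] = {0};
    uint64_t packed[2] = {0};                    /* (21 + 15) / 16 = 2 legs */
    for (int i = 0; i < 21; i++) row[i] = (unsigned char)((i * 5 + 3) & 0xf);

    pack_nibbles(row, packed, 21);
    unpack_nibbles(2, packed, back);
    assert(memcmp(row, back, 21) == 0);          /* round trip, odd tail included */
    printf("nibble pack/unpack round trip OK\n");
    return 0;
}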
+
+
+// put matrix in row echelon form with ones on first nonzero entries *in
+// constant time*
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+
+ alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+ alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+ alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };
+
+ int row_len = (ncols + 15) / 16;
+
+ // nibbleslice the matrix A
+ for (int i = 0; i < nrows; i++) {
+ ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+ }
+
+ // pivot row is secret, pivot col is not
+
+ unsigned char inverse;
+ int pivot_row = 0;
+ for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+ int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+ int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+ // the pivot row is guaranteed to be between these lower and upper bounds if
+ // A has full rank
+
+ // zero out pivot row
+ for (int i = 0; i < row_len; i++) {
+ _pivot_row[i] = 0;
+ _pivot_row2[i] = 0;
+ }
+
+ // try to get a pivot row in constant time
+ unsigned char pivot = 0;
+ uint64_t pivot_is_zero = -1;
+ for (int row = pivot_row_lower_bound;
+ row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+
+ uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+ uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+
+ for (int j = 0; j < row_len; j++) {
+ _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+ packed_A[row * row_len + j];
+ }
+ pivot = m_extract_element(_pivot_row, pivot_col);
+ pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+ }
+
+ // multiply pivot row by inverse of pivot
+ inverse = inverse_f(pivot);
+ vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+
+ // conditionally write pivot row to the correct row, if there is a nonzero
+ // pivot
+ for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+ uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+ uint64_t do_not_copy = ~do_copy;
+ for (int col = 0; col < row_len; col++) {
+ packed_A[row * row_len + col] =
+ (do_not_copy & packed_A[row * row_len + col]) +
+ (do_copy & _pivot_row2[col]);
+ }
+ }
+
+ // eliminate entries below pivot
+ for (int row = pivot_row_lower_bound; row < nrows; row++) {
+ unsigned char below_pivot = (row > pivot_row);
+ unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+
+ vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+ packed_A + row * row_len);
+ }
+
+ pivot_row += (-(int64_t)(~pivot_is_zero));
+ }
+
+ unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+
+ // unbitslice the matrix A
+ for (int i = 0; i < nrows; i++) {
+ ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+ for (int j = 0; j < ncols; j++) {
+ A[i * ncols + j] = temp[j];
+ }
+ }
+
+ mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+ mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+ mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+ mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
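
EF() keeps the pivot row secret by never branching on data: helpers in the ct_compare_64 family produce an all-zero or all-one mask, and a row is updated as (keep_mask & old) combined with (copy_mask & new). A self-contained sketch of that mask-select idiom; ct_eq_mask below is an illustrative stand-in, since the exact sign convention of the library's ct_compare_64 lives outside this file:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

/* all-ones if a == b, all-zero otherwise, with no data-dependent branch
 * (illustrative stand-in for the ct_compare_64-based masks used in EF()) */
static uint64_t ct_eq_mask(uint64_t a, uint64_t b) {
    uint64_t d = a ^ b;
    uint64_t ne = (d | (0 - d)) >> 63;   /* 1 if a != b, 0 if equal          */
    return ne - 1;                        /* wraps to all-ones when ne == 0   */
}

static uint64_t ct_select(uint64_t mask, uint64_t take, uint64_t keep) {
    return (mask & take) | (~mask & keep);   /* EF() uses + with disjoint masks */
}

int main(void) {
    uint64_t rows[4] = { 0x1111, 0x2222, 0x3333, 0x4444 };
    int secret_pivot = 2;                 /* index we must not branch on */
    uint64_t picked = 0;

    /* scan every row; only the one matching secret_pivot survives the mask */
    for (int r = 0; r < 4; r++) {
        uint64_t m = ct_eq_mask((uint64_t)r, (uint64_t)secret_pivot);
        picked = ct_select(m, rows[r], picked);
    }
    assert(picked == 0x3333);
    printf("constant-time row selection OK\n");
    return 0;
}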
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include