diff --git a/.CMake/alg_support.cmake b/.CMake/alg_support.cmake
index 73e972dd70..7309c800f3 100644
--- a/.CMake/alg_support.cmake
+++ b/.CMake/alg_support.cmake
@@ -166,6 +166,12 @@ cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192f_simple "" ON "OQS_ENABL
 cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_192s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
 cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256f_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
 cmake_dependent_option(OQS_ENABLE_SIG_sphincs_shake_256s_simple "" ON "OQS_ENABLE_SIG_SPHINCS" OFF)
+
+option(OQS_ENABLE_SIG_MAYO "Enable mayo algorithm family" ON)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_1 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_2 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_3 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
+cmake_dependent_option(OQS_ENABLE_SIG_mayo_5 "" ON "OQS_ENABLE_SIG_MAYO" OFF)
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_END
 
 if((OQS_MINIMAL_BUILD STREQUAL "ON"))
@@ -184,6 +190,8 @@ elseif (${OQS_ALGS_ENABLED} STREQUAL "STD")
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_LIST_STANDARDIZED_ALGS_END
 elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_R4")
 	filter_algs("KEM_classic_mceliece_348864;KEM_classic_mceliece_348864f;KEM_classic_mceliece_460896;KEM_classic_mceliece_460896f;KEM_classic_mceliece_6688128;KEM_classic_mceliece_6688128f;KEM_classic_mceliece_6960119;KEM_classic_mceliece_6960119f;KEM_classic_mceliece_8192128;KEM_classic_mceliece_8192128f;KEM_hqc_128;KEM_hqc_192;KEM_hqc_256;KEM_bike_l1;KEM_bike_l3;KEM_bike_l5")
+elseif(${OQS_ALGS_ENABLED} STREQUAL "NIST_SIG_ONRAMP")
+	filter_algs("SIG_mayo_1;SIG_mayo_2;SIG_mayo_3;SIG_mayo_5")
 else()
 	message(STATUS "Alg enablement unchanged")
 endif()
@@ -495,6 +503,31 @@ if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
 endif()
 endif()
 
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+    cmake_dependent_option(OQS_ENABLE_SIG_mayo_1_avx2 "" ON "OQS_ENABLE_SIG_mayo_1" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+    cmake_dependent_option(OQS_ENABLE_SIG_mayo_2_avx2 "" ON "OQS_ENABLE_SIG_mayo_2" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+    cmake_dependent_option(OQS_ENABLE_SIG_mayo_3_avx2 "" ON "OQS_ENABLE_SIG_mayo_3" OFF)
+endif()
+endif()
+
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin|Linux")
+if(OQS_DIST_X86_64_BUILD OR (OQS_USE_AVX2_INSTRUCTIONS))
+    cmake_dependent_option(OQS_ENABLE_SIG_mayo_5_avx2 "" ON "OQS_ENABLE_SIG_mayo_5" OFF)
+endif()
+endif()
+
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ENABLE_BY_ALG_CONDITIONAL_END
 
 option(OQS_ENABLE_SIG_STFL_XMSS "Enable XMSS algorithm family" OFF)
diff --git a/.github/workflows/release-test.yml b/.github/workflows/release-test.yml
index 47957f4d20..2a4addd541 100644
--- a/.github/workflows/release-test.yml
+++ b/.github/workflows/release-test.yml
@@ -17,7 +17,7 @@ on:
 
 jobs:
   oqs-provider-release-test:
-    if: github.event_name == 'release' || endsWith( github.event.head_commit.message, '[trigger downstream]' )
+    if: github.event_name == 'release' || contains( github.event.head_commit.message, '[trigger downstream]' )
     runs-on: ubuntu-latest
     steps:
       - name: Checkout release tests script
diff --git a/.github/workflows/unix.yml b/.github/workflows/unix.yml
index 5882d9bc8f..49d520eaee 100644
--- a/.github/workflows/unix.yml
+++ b/.github/workflows/unix.yml
@@ -112,6 +112,11 @@ jobs:
             container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
             CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON -DOQS_ALGS_ENABLED=NIST_R4
             PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py
+          - name: focal-nistonramp-openssl
+            runner: ubuntu-latest
+            container: openquantumsafe/ci-ubuntu-focal-x86_64:latest
+            CMAKE_ARGS: -DOQS_STRICT_WARNINGS=ON -DOQS_ALGS_ENABLED=NIST_SIG_ONRAMP
+            PYTEST_ARGS: --ignore=tests/test_leaks.py --ignore=tests/test_kat_all.py
           - name: jammy-std-openssl3
             runner: ubuntu-latest
             container: openquantumsafe/ci-ubuntu-jammy:latest
@@ -256,7 +261,7 @@ jobs:
       - name: Install dependencies
         run: env HOMEBREW_NO_AUTO_UPDATE=1 brew install ninja && pip3 install --require-hashes --break-system-packages -r .github/workflows/requirements.txt
       - name: Patch GCC
-        run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies gcc@13.rb
+        run: env HOMEBREW_NO_AUTO_UPDATE=1 brew uninstall --ignore-dependencies gcc@13 && wget https://raw.githubusercontent.com/Homebrew/homebrew-core/eb6dd225d093b66054e18e07d56509cf670793b1/Formula/g/gcc%4013.rb && env HOMEBREW_NO_AUTO_UPDATE=1 brew install --ignore-dependencies --formula gcc@13.rb
       - name: Get system information
         run: sysctl -a | grep machdep.cpu
       - name: Configure
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 0524a07c5b..ebbd58962f 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -200,6 +200,9 @@ endif()
 if(OQS_ENABLE_SIG_SPHINCS)
     set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/sphincs/sig_sphincs.h)
 endif()
+if(OQS_ENABLE_SIG_MAYO)
+    set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig/mayo/sig_mayo.h)
+endif()
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_HEADERS_END
 if(OQS_ENABLE_SIG_STFL_XMSS)
     set(PUBLIC_HEADERS ${PUBLIC_HEADERS} ${PROJECT_SOURCE_DIR}/src/sig_stfl/xmss/sig_stfl_xmss.h)
diff --git a/CONFIGURE.md b/CONFIGURE.md
index 9bae9f5af2..d22c9fa34f 100644
--- a/CONFIGURE.md
+++ b/CONFIGURE.md
@@ -58,9 +58,9 @@ For a full list of such options and their default values, consult [.CMake/alg_su
 
 ## OQS_ALGS_ENABLED
 
-A selected algorithm set is enabled. Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl).
+A selected algorithm set is enabled. Possible values are "STD" selecting all algorithms standardized by NIST; "NIST_R4" selecting all algorithms evaluated in round 4 of the NIST PQC competition; "NIST_SIG_ONRAMP" selecting algorithms evaluated in the NIST PQC "onramp" standardization for additional signature schemes; "All" (or any other value) selecting all algorithms integrated into liboqs. Parameter setting "STD" minimizes library size but may require re-running code generator scripts in projects integrating `liboqs`; e.g., [oqs-provider](https://github.com/open-quantum-safe/oqs-provider) and [oqs-boringssl](https://github.com/open-quantum-safe/boringssl).
 
-**Attention**: If you use any predefined value (`STD` or `NIST_R4` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored.
+**Attention**: If you use any predefined value (`STD`, `NIST_R4`, or `NIST_SIG_ONRAMP` as of now) for this variable, the values added via [OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG](#OQS_ENABLE_KEM_ALG/OQS_ENABLE_SIG_ALG/OQS_ENABLE_SIG_STFL_ALG) variables will be ignored.
 
 **Default**: `All`.
 
diff --git a/PLATFORMS.md b/PLATFORMS.md
index 60f695d886..e2220229ae 100644
--- a/PLATFORMS.md
+++ b/PLATFORMS.md
@@ -62,4 +62,3 @@ In this policy, the words "must" and "must not" specify absolute requirements th
 - x86 for Windows (Visual Studio Toolchain)
 - ppc64le for Ubuntu (Focal)
 - s390x for Ubuntu (Focal)
-
diff --git a/README.md b/README.md
index 2b8122b4d7..b21281e2cf 100644
--- a/README.md
+++ b/README.md
@@ -65,6 +65,7 @@ All names other than `ML-KEM` and `ML-DSA` are subject to change. `liboqs` makes
 <!--- OQS_TEMPLATE_FRAGMENT_LIST_SIGS_START -->
 - **CRYSTALS-Dilithium**: Dilithium2, Dilithium3, Dilithium5
 - **Falcon**: Falcon-512, Falcon-1024, Falcon-padded-512, Falcon-padded-1024
+- **MAYO**: MAYO-1, MAYO-2, MAYO-3, MAYO-5†
 - **ML-DSA**: ML-DSA-44-ipd (alias: ML-DSA-44), ML-DSA-65-ipd (alias: ML-DSA-65), ML-DSA-87-ipd (alias: ML-DSA-87)
 - **SPHINCS+-SHA2**: SPHINCS+-SHA2-128f-simple, SPHINCS+-SHA2-128s-simple, SPHINCS+-SHA2-192f-simple, SPHINCS+-SHA2-192s-simple, SPHINCS+-SHA2-256f-simple, SPHINCS+-SHA2-256s-simple
 - **SPHINCS+-SHAKE**: SPHINCS+-SHAKE-128f-simple, SPHINCS+-SHAKE-128s-simple, SPHINCS+-SHAKE-192f-simple, SPHINCS+-SHAKE-192s-simple, SPHINCS+-SHAKE-256f-simple, SPHINCS+-SHAKE-256s-simple
@@ -197,6 +198,7 @@ liboqs includes some third party libraries or modules that are licensed differen
 - `src/sig/dilithium/pqcrystals-*`: public domain (CC0) or Apache License v2.0
 - `src/sig/dilithium/pqclean_*`: public domain (CC0), and public domain (CC0) or Apache License v2.0, and public domain (CC0) or MIT, and MIT
 -  src/sig/falcon/pqclean_\*\_aarch64 : Apache License v2.0
+- `src/sig/mayo/*`: Apache License v2.0
 - `src/sig/ml_dsa/pqcrystals-*`: public domain (CC0) or Apache License v2.0
 - `src/sig/sphincs/pqclean_*`: CC0 (public domain)
 
diff --git a/docs/algorithms/sig/mayo.md b/docs/algorithms/sig/mayo.md
new file mode 100644
index 0000000000..3174058f13
--- /dev/null
+++ b/docs/algorithms/sig/mayo.md
@@ -0,0 +1,62 @@
+# MAYO
+
+- **Algorithm type**: Digital signature scheme.
+- **Main cryptographic assumption**: multivariable quadratic equations, oil and vinegar.
+- **Principal submitters**: Ward Beullens, Fabio Campos, Sofía Celi, Basil Hess, Matthias J. Kannwischer.
+- **Authors' website**: https://pqmayo.org
+- **Specification version**: https://doi.org/10.46586/tches.v2024.i2.252-275.
+- **Primary Source**<a name="primary-source"></a>:
+  - **Source**: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e with copy_from_upstream patches
+  - **Implementation license (SPDX-Identifier)**: Apache-2.0
+
+
+## Parameter set summary
+
+|  Parameter set  | Parameter set alias   | Security model   |   Claimed NIST Level |   Public key size (bytes) |   Secret key size (bytes) |   Signature size (bytes) |
+|:---------------:|:----------------------|:-----------------|---------------------:|--------------------------:|--------------------------:|-------------------------:|
+|     MAYO-1      | NA                    | EUF-CMA          |                    1 |                      1168 |                        24 |                      321 |
+|     MAYO-2      | NA                    | EUF-CMA          |                    1 |                      5488 |                        24 |                      180 |
+|     MAYO-3      | NA                    | EUF-CMA          |                    3 |                      2656 |                        32 |                      577 |
+|     MAYO-5      | NA                    | EUF-CMA          |                    5 |                      5008 |                        40 |                      838 |
+
+## MAYO-1 implementation characteristics
+
+|       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?‡   |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:----------------------|
+| [Primary Source](#primary-source) | opt                      | All                         | All                             | None                    | True                               | True                                           | False                 |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Darwin,Linux                    | AVX2                    | True                               | True                                           | False                 |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+ ‡For an explanation of what this denotes, consult the [Explanation of Terms](#explanation-of-terms) section at the end of this file.
+
+## MAYO-2 implementation characteristics
+
+|       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Darwin,Linux                    | AVX2                    | True                               | True                                           | False                |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## MAYO-3 implementation characteristics
+
+|       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Darwin,Linux                    | AVX2                    | True                               | True                                           | False                |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## MAYO-5 implementation characteristics
+
+|       Implementation source       | Identifier in upstream   | Supported architecture(s)   | Supported operating system(s)   | CPU extension(s) used   | No branching-on-secrets claimed?   | No branching-on-secrets checked by valgrind?   | Large stack usage?   |
+|:---------------------------------:|:-------------------------|:----------------------------|:--------------------------------|:------------------------|:-----------------------------------|:-----------------------------------------------|:---------------------|
+| [Primary Source](#primary-source) | opt                      | All                         | All                             | None                    | True                               | True                                           | False                |
+| [Primary Source](#primary-source) | avx2                     | x86\_64                     | Darwin,Linux                    | AVX2                    | True                               | True                                           | True                 |
+
+Are implementations chosen based on runtime CPU feature detection? **Yes**.
+
+## Explanation of Terms
+
+- **Large Stack Usage**: Implementations identified as having such may cause failures when running in threads or in constrained environments.
\ No newline at end of file
diff --git a/docs/algorithms/sig/mayo.yml b/docs/algorithms/sig/mayo.yml
new file mode 100644
index 0000000000..0d84b9381d
--- /dev/null
+++ b/docs/algorithms/sig/mayo.yml
@@ -0,0 +1,143 @@
+name: MAYO
+type: signature
+principal-submitters:
+- Ward Beullens
+- Fabio Campos
+- Sofía Celi
+- Basil Hess
+- Matthias J. Kannwischer
+crypto-assumption: multivariable quadratic equations, oil and vinegar
+website: https://pqmayo.org
+nist-round: 1
+spec-version: https://doi.org/10.46586/tches.v2024.i2.252-275
+primary-upstream:
+  source: https://github.com/PQCMayo/MAYO-C/commit/cde2675ff404b0ae070e7dbc3d962ea0b026a81e
+    with copy_from_upstream patches
+  spdx-license-identifier: Apache-2.0
+parameter-sets:
+- name: MAYO-1
+  claimed-nist-level: 1
+  claimed-security: EUF-CMA
+  length-public-key: 1168
+  length-secret-key: 24
+  length-signature: 321
+  implementations-switch-on-runtime-cpu-features: true
+  implementations:
+  - upstream: primary-upstream
+    upstream-id: opt
+    supported-platforms: all
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: avx2
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Darwin
+      - Linux
+      required_flags:
+      - avx2
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+- name: MAYO-2
+  claimed-nist-level: 1
+  claimed-security: EUF-CMA
+  length-public-key: 5488
+  length-secret-key: 24
+  length-signature: 180
+  implementations-switch-on-runtime-cpu-features: true
+  implementations:
+  - upstream: primary-upstream
+    upstream-id: opt
+    supported-platforms: all
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: avx2
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Darwin
+      - Linux
+      required_flags:
+      - avx2
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+- name: MAYO-3
+  claimed-nist-level: 3
+  claimed-security: EUF-CMA
+  length-public-key: 2656
+  length-secret-key: 32
+  length-signature: 577
+  implementations-switch-on-runtime-cpu-features: true
+  implementations:
+  - upstream: primary-upstream
+    upstream-id: opt
+    supported-platforms: all
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: avx2
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Darwin
+      - Linux
+      required_flags:
+      - avx2
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+- name: MAYO-5
+  claimed-nist-level: 5
+  claimed-security: EUF-CMA
+  length-public-key: 5008
+  length-secret-key: 40
+  length-signature: 838
+  implementations-switch-on-runtime-cpu-features: true
+  implementations:
+  - upstream: primary-upstream
+    upstream-id: opt
+    supported-platforms: all
+    common-crypto:
+    - SHA3: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: false
+  - upstream: primary-upstream
+    upstream-id: avx2
+    supported-platforms:
+    - architecture: x86_64
+      operating_systems:
+      - Darwin
+      - Linux
+      required_flags:
+      - avx2
+    common-crypto:
+    - SHA3: liboqs
+    - AES: liboqs
+    no-secret-dependent-branching-claimed: true
+    no-secret-dependent-branching-checked-by-valgrind: true
+    large-stack-usage: true
diff --git a/docs/cbom.json b/docs/cbom.json
index 7dd47dc218..358fc28b39 100644
--- a/docs/cbom.json
+++ b/docs/cbom.json
@@ -1,23 +1,23 @@
 {
   "bomFormat": "CBOM",
   "specVersion": "1.4-cbom-1.0",
-  "serialNumber": "urn:uuid:b3ac0f3d-b320-4f0f-bbef-6c535c1e9874",
+  "serialNumber": "urn:uuid:004d7395-7601-44af-97dd-57c2214e5f60",
   "version": 1,
   "metadata": {
-    "timestamp": "2024-03-05T11:49:42.428605",
+    "timestamp": "2024-07-11T15:22:22.228289",
     "component": {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
       "name": "liboqs",
-      "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
+      "version": "ca5d956097e10672aaa9bb7994057bcc58291b65"
     }
   },
   "components": [
     {
       "type": "library",
-      "bom-ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+      "bom-ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
       "name": "liboqs",
-      "version": "1f393bfe3690c6ef1cac9070d166995ce4fb3e9d"
+      "version": "ca5d956097e10672aaa9bb7994057bcc58291b65"
     },
     {
       "type": "crypto-asset",
@@ -1539,6 +1539,166 @@
         "nistQuantumSecurityLevel": 5
       }
     },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-1:generic",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-1",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "generic"
+        },
+        "nistQuantumSecurityLevel": 1
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-1:x86_64",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-1",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "x86_64"
+        },
+        "nistQuantumSecurityLevel": 1
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-2:generic",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-2",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "generic"
+        },
+        "nistQuantumSecurityLevel": 1
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-2:x86_64",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-2",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "x86_64"
+        },
+        "nistQuantumSecurityLevel": 1
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-3:generic",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-3",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "generic"
+        },
+        "nistQuantumSecurityLevel": 3
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-3:x86_64",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-3",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "x86_64"
+        },
+        "nistQuantumSecurityLevel": 3
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-5:generic",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-5",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "generic"
+        },
+        "nistQuantumSecurityLevel": 5
+      }
+    },
+    {
+      "type": "crypto-asset",
+      "bom-ref": "alg:MAYO-5:x86_64",
+      "name": "MAYO",
+      "cryptoProperties": {
+        "assetType": "algorithm",
+        "algorithmProperties": {
+          "variant": "MAYO-5",
+          "primitive": "signature",
+          "implementationLevel": "softwarePlainRam",
+          "cryptoFunctions": [
+            "keygen",
+            "sign",
+            "verify"
+          ],
+          "implementationPlatform": "x86_64"
+        },
+        "nistQuantumSecurityLevel": 5
+      }
+    },
     {
       "type": "crypto-asset",
       "bom-ref": "alg:ML-DSA-44-ipd:generic",
@@ -2168,7 +2328,7 @@
   ],
   "dependencies": [
     {
-      "ref": "pkg:github/open-quantum-safe/liboqs@1f393bfe3690c6ef1cac9070d166995ce4fb3e9d",
+      "ref": "pkg:github/open-quantum-safe/liboqs@ca5d956097e10672aaa9bb7994057bcc58291b65",
       "dependsOn": [
         "alg:BIKE-L1:x86_64",
         "alg:BIKE-L3:x86_64",
@@ -2246,6 +2406,14 @@
         "alg:Falcon-padded-1024:generic",
         "alg:Falcon-padded-1024:x86_64",
         "alg:Falcon-padded-1024:armv8-a",
+        "alg:MAYO-1:generic",
+        "alg:MAYO-1:x86_64",
+        "alg:MAYO-2:generic",
+        "alg:MAYO-2:x86_64",
+        "alg:MAYO-3:generic",
+        "alg:MAYO-3:x86_64",
+        "alg:MAYO-5:generic",
+        "alg:MAYO-5:x86_64",
         "alg:ML-DSA-44-ipd:generic",
         "alg:ML-DSA-44-ipd:x86_64",
         "alg:ML-DSA-65-ipd:generic",
@@ -2843,6 +3011,68 @@
       ],
       "dependencyType": "uses"
     },
+    {
+      "ref": "alg:MAYO-1:generic",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-1:x86_64",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-2:generic",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-2:x86_64",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-3:generic",
+      "dependsOn": [
+        "alg:sha3"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-3:x86_64",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-5:generic",
+      "dependsOn": [
+        "alg:sha3"
+      ],
+      "dependencyType": "uses"
+    },
+    {
+      "ref": "alg:MAYO-5:x86_64",
+      "dependsOn": [
+        "alg:sha3",
+        "alg:aes"
+      ],
+      "dependencyType": "uses"
+    },
     {
       "ref": "alg:ML-DSA-44-ipd:generic",
       "dependsOn": [
diff --git a/scripts/copy_from_upstream/copy_from_upstream.yml b/scripts/copy_from_upstream/copy_from_upstream.yml
index 3417180c7c..f9582fa74f 100644
--- a/scripts/copy_from_upstream/copy_from_upstream.yml
+++ b/scripts/copy_from_upstream/copy_from_upstream.yml
@@ -53,6 +53,14 @@ upstreams:
     sig_meta_path: '{pretty_name_full}_META.yml'
     sig_scheme_path: '.'
     patches: [pqcrystals-ml_dsa_ipd.patch]
+  -
+    name: pqmayo
+    git_url: https://github.com/PQCMayo/MAYO-C.git
+    git_branch: nibbling-mayo
+    git_commit: cde2675ff404b0ae070e7dbc3d962ea0b026a81e
+    sig_meta_path: 'META/{pretty_name_full}_META.yml'
+    sig_scheme_path: '.'
+    patches: [pqmayo-aes.patch, pqmayo-mem.patch]
 kems:
   -
     name: classic_mceliece
@@ -301,3 +309,28 @@ sigs:
         pqclean_scheme: sphincs-shake-256s-simple
         pretty_name_full: SPHINCS+-SHAKE-256s-simple
         signed_msg_order: sig_then_msg
+  -
+    name: mayo
+    default_implementation: opt
+    upstream_location: pqmayo
+    schemes:
+      -
+        scheme: "1"
+        pqclean_scheme: mayo-1
+        pretty_name_full: MAYO-1
+        signed_msg_order: sig_then_msg
+      -
+        scheme: "2"
+        pqclean_scheme: mayo-2
+        pretty_name_full: MAYO-2
+        signed_msg_order: sig_then_msg
+      -
+        scheme: "3"
+        pqclean_scheme: mayo-3
+        pretty_name_full: MAYO-3
+        signed_msg_order: sig_then_msg
+      -
+        scheme: "5"
+        pqclean_scheme: mayo-5
+        pretty_name_full: MAYO-5
+        signed_msg_order: sig_then_msg
diff --git a/scripts/copy_from_upstream/patches/pqmayo-aes.patch b/scripts/copy_from_upstream/patches/pqmayo-aes.patch
new file mode 100644
index 0000000000..2dd469eed3
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqmayo-aes.patch
@@ -0,0 +1,22 @@
+diff --git a/src/common/aes_ctr.h b/src/common/aes_ctr.h
+index c47c01e..c5fd013 100644
+--- a/src/common/aes_ctr.h
++++ b/src/common/aes_ctr.h
+@@ -16,8 +16,14 @@ int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                       const unsigned char *input, size_t inputByteLen);
+ #define AES_128_CTR AES_128_CTR_NI
+ #else
+-int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+-                const unsigned char *input, size_t inputByteLen);
++#include <aes.h>
++static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
++                const unsigned char *input, size_t inputByteLen) {
++    (void) inputByteLen;
++    uint8_t iv[12] = { 0 };
++    aes128ctr_prf(output, outputByteLen, input, iv);
++    return (int) outputByteLen;
++}
+ #endif
+ 
+ #endif
+ 
\ No newline at end of file
diff --git a/scripts/copy_from_upstream/patches/pqmayo-mem.patch b/scripts/copy_from_upstream/patches/pqmayo-mem.patch
new file mode 100644
index 0000000000..ab47b79a06
--- /dev/null
+++ b/scripts/copy_from_upstream/patches/pqmayo-mem.patch
@@ -0,0 +1,33 @@
+diff --git a/include/mem.h b/include/mem.h
+index 4695847..dc5172c 100644
+--- a/include/mem.h
++++ b/include/mem.h
+@@ -40,13 +40,16 @@ static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+     return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+ }
+ 
++#include <oqs/common.h>
+ /**
+  * Clears and frees allocated memory.
+  * 
+  * @param[out] mem Memory to be cleared and freed.
+  * @param size Size of memory to be cleared and freed.
+  */
+-void mayo_secure_free(void *mem, size_t size);
++static inline void mayo_secure_free(void *mem, size_t size) {
++    OQS_MEM_secure_free(mem, size);
++}
+ 
+ /**
+  * Clears memory.
+@@ -54,6 +57,8 @@ void mayo_secure_free(void *mem, size_t size);
+  * @param[out] mem Memory to be cleared.
+  * @param size Size of memory to be cleared.
+  */
+-void mayo_secure_clear(void *mem, size_t size);
++static inline void mayo_secure_clear(void *mem, size_t size) {
++    OQS_MEM_cleanse(mem, size);
++}
+ 
+ #endif
+\ No newline at end of file
diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt
index a5b64fd294..25a9b74086 100644
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,6 +55,10 @@ if(OQS_ENABLE_SIG_SPHINCS)
     add_subdirectory(sig/sphincs)
     set(SIG_OBJS ${SIG_OBJS} ${SPHINCS_OBJS})
 endif()
+if(OQS_ENABLE_SIG_MAYO)
+    add_subdirectory(sig/mayo)
+    set(SIG_OBJS ${SIG_OBJS} ${MAYO_OBJS})
+endif()
 ##### OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_OBJECTS_END
 
 if(OQS_ENABLE_SIG_STFL_XMSS)
diff --git a/src/common/CMakeLists.txt b/src/common/CMakeLists.txt
index 73b917e07c..d82b4ea268 100644
--- a/src/common/CMakeLists.txt
+++ b/src/common/CMakeLists.txt
@@ -26,7 +26,7 @@ else()
    if (OQS_DIST_X86_64_BUILD OR OQS_USE_AES_INSTRUCTIONS)
       set(AES_IMPL ${AES_IMPL} aes/aes128_ni.c)
       set(AES_IMPL ${AES_IMPL} aes/aes256_ni.c)
-      set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS -maes)
+      set_source_files_properties(aes/aes128_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3")
       set_source_files_properties(aes/aes256_ni.c PROPERTIES COMPILE_FLAGS "-maes -mssse3")
    elseif (OQS_DIST_ARM64_V8_BUILD)
       set(AES_IMPL ${AES_IMPL} aes/aes128_armv8.c)
diff --git a/src/common/aes/aes.c b/src/common/aes/aes.c
index 3ac8794991..01e473b819 100644
--- a/src/common/aes/aes.c
+++ b/src/common/aes/aes.c
@@ -19,6 +19,18 @@ void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **schedule) {
 	callbacks->AES128_ECB_load_schedule(key, schedule);
 }
 
+void OQS_AES128_CTR_inc_init(const uint8_t *key, void **_schedule) {
+	callbacks->AES128_CTR_inc_init(key, _schedule);
+}
+
+void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) {
+	callbacks->AES128_CTR_inc_iv(iv, iv_len, _schedule);
+}
+
+void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) {
+	callbacks->AES128_CTR_inc_ivu64(iv, _schedule);
+}
+
 void OQS_AES128_free_schedule(void *schedule) {
 	callbacks->AES128_free_schedule(schedule);
 }
@@ -51,6 +63,10 @@ void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len
 	callbacks->AES128_ECB_enc_sch(plaintext, plaintext_len, schedule, ciphertext);
 }
 
+void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	callbacks->AES128_CTR_inc_stream_iv(iv, iv_len, schedule, out, out_len);
+}
+
 void OQS_AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
 	callbacks->AES256_ECB_enc(plaintext, plaintext_len, key, ciphertext);
 }
diff --git a/src/common/aes/aes.h b/src/common/aes/aes.h
index 011686b3e9..d0d6d634bc 100644
--- a/src/common/aes/aes.h
+++ b/src/common/aes/aes.h
@@ -28,6 +28,37 @@ extern "C" {
  */
 void OQS_AES128_ECB_load_schedule(const uint8_t *key, void **ctx);
 
+/**
+ * Function to initialize a context and fill a key schedule given an initial key for
+ * use in CTR mode.
+ *
+ * @param key            Initial Key.
+ * @param ctx            Abstract data structure for a key schedule.
+ */
+void OQS_AES128_CTR_inc_init(const uint8_t *key, void **ctx);
+
+/**
+ * Function to fill a context given an IV for use in CTR mode.
+ *
+ * Handles a 12- or 16-byte IV.  If a 12-byte IV is given, then 4 counter
+ * bytes are initialized to all zeros.
+ *
+ * @param iv             Initialization Vector.
+ * @param iv_len         Length of the initialization vector.
+ * @param ctx            Abstract data structure for IV.
+ */
+void OQS_AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *ctx);
+
+/**
+ * Function to fill a context given an IV for use in CTR mode.
+ * Handles an 8-byte IV passed as a 64-bit unsigned integer,
+ * counter bytes are initialized to zero.
+ *
+ * @param iv             Initialization Vector as 64-bit integer.
+ * @param ctx            Abstract data structure for IV.
+ */
+void OQS_AES128_CTR_inc_ivu64(uint64_t iv, void *ctx);
+
 /**
  * Function to free a key schedule.
  *
@@ -55,6 +86,21 @@ void OQS_AES128_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, co
  */
 void OQS_AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
 
+/**
+ * AES counter mode keystream generator.  A context generated by
+ * OQS_AES128_CTR_inc_init() is passed rather than a key.
+ *
+ * Handles a 12- or 16-byte IV.  If a 12-byte IV is given, then 4 counter
+ * bytes are initialized to all zeros.
+ *
+ * @param iv       12- or 16-byte initialization vector.
+ * @param iv_len   Length of IV in bytes.
+ * @param ctx      Abstract data structure for a key schedule.
+ * @param out      Pointer to a block of memory which is big enough to contain out_len bytes; the result will be written here.
+ * @param out_len  Length of output bytes to generate.
+ */
+void OQS_AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len);
+
 /**
  * Function to fill a key schedule given an initial key for use in ECB mode encryption.
  *
diff --git a/src/common/aes/aes128_armv8.c b/src/common/aes/aes128_armv8.c
index b5003ad018..292539fefa 100644
--- a/src/common/aes/aes128_armv8.c
+++ b/src/common/aes/aes128_armv8.c
@@ -3,15 +3,28 @@
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 #include <oqs/common.h>
 
 #include <arm_neon.h>
 
-#define PQC_AES128_STATESIZE 88
 typedef struct {
-	uint64_t sk_exp[PQC_AES128_STATESIZE];
+	uint64_t sk_exp[22];
+	uint8_t iv[16];
 } aes128ctx;
 
+void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule) {
+	aes128ctx *ctx = _schedule;
+	if (iv_len == 12) {
+		memcpy(ctx->iv, iv, 12);
+		memset(&ctx->iv[12], 0, 4);
+	} else if (iv_len == 16) {
+		memcpy(ctx->iv, iv, 16);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+}
+
 // From crypto_core/aes128encrypt/dolbeau/armv8crypto
 static inline void aes128_armv8_encrypt(const unsigned char *rkeys, const unsigned char *n, unsigned char *out) {
 	uint8x16_t temp = vld1q_u8(n);
@@ -62,3 +75,64 @@ void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plainte
 		oqs_aes128_enc_sch_block_armv8(plaintext + (16 * block), (const void *) ctx->sk_exp, ciphertext + (16 * block));
 	}
 }
+
+static uint32_t UINT32_TO_BE(const uint32_t x) {
+	union {
+		uint32_t val;
+		uint8_t bytes[4];
+	} y;
+	y.bytes[0] = (x >> 24) & 0xFF;
+	y.bytes[1] = (x >> 16) & 0xFF;
+	y.bytes[2] = (x >> 8) & 0xFF;
+	y.bytes[3] = x & 0xFF;
+	return y.val;
+}
+#define BE_TO_UINT32(n) (uint32_t)((((uint8_t *) &(n))[0] << 24) | (((uint8_t *) &(n))[1] << 16) | (((uint8_t *) &(n))[2] << 8) | (((uint8_t *) &(n))[3] << 0))
+
+
+void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks) {
+	aes128ctx *ctx = (aes128ctx *) schedule;
+	uint8_t *block = ctx->iv;
+	uint32_t ctr;
+	uint32_t ctr_be;
+	memcpy(&ctr_be, &block[12], 4);
+	ctr = BE_TO_UINT32(ctr_be);
+	while (out_blks >= 1) {
+		oqs_aes128_enc_sch_block_armv8(block, schedule, out);
+		out += 16;
+		out_blks--;
+		ctr++;
+		ctr_be = UINT32_TO_BE(ctr);
+		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+	}
+}
+
+void oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	uint8_t block[16];
+	uint32_t ctr;
+	uint32_t ctr_be;
+	if (iv_len == 12) {
+		ctr = 0; // 12-byte IV: block counter starts at zero
+	} else if (iv_len == 16) {
+		memcpy(&ctr_be, &iv[12], 4);
+		ctr = BE_TO_UINT32(ctr_be); // 16-byte IV: bytes 12..15 hold the big-endian start counter
+	} else {
+		exit(EXIT_FAILURE); // only 12- and 16-byte IVs are supported
+	}
+	memcpy(block, iv, 12); // read the IV only after iv_len has been validated
+	while (out_len >= 16) {
+		ctr_be = UINT32_TO_BE(ctr);
+		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+		oqs_aes128_enc_sch_block_armv8(block, schedule, out);
+		out += 16;
+		out_len -= 16;
+		ctr++;
+	}
+	if (out_len > 0) {
+		uint8_t tmp[16]; // final partial block: encrypt to scratch, copy only out_len bytes
+		ctr_be = UINT32_TO_BE(ctr);
+		memcpy(&block[12], (uint8_t *) &ctr_be, 4);
+		oqs_aes128_enc_sch_block_armv8(block, schedule, tmp);
+		memcpy(out, tmp, out_len);
+	}
+}
diff --git a/src/common/aes/aes128_ni.c b/src/common/aes/aes128_ni.c
index 0593614503..b08a3041a4 100644
--- a/src/common/aes/aes128_ni.c
+++ b/src/common/aes/aes128_ni.c
@@ -5,9 +5,16 @@
 #include <assert.h>
 #include <stdint.h>
 #include <stdio.h>
+#include <string.h>
 #include <oqs/common.h>
 
 #include <wmmintrin.h>
+#include <tmmintrin.h>
+
+typedef struct {
+	__m128i sk_exp[11];
+	__m128i iv;
+} aes128ctx;
 
 // From crypto_core/aes128ncrypt/dolbeau/aesenc-int
 static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rkeys[11]) {
@@ -42,21 +49,39 @@ static inline void aes128ni_setkey_encrypt(const unsigned char *key, __m128i rke
 }
 
 void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule) {
-	*_schedule = malloc(11 * sizeof(__m128i));
+	*_schedule = malloc(sizeof(aes128ctx));
+	OQS_EXIT_IF_NULLPTR(*_schedule, "AES");
 	assert(*_schedule != NULL);
-	__m128i *schedule = (__m128i *) *_schedule;
+	__m128i *schedule = ((aes128ctx *) *_schedule)->sk_exp;
 	aes128ni_setkey_encrypt(key, schedule);
 }
 
+void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule) {
+	aes128ctx *ctx = _schedule;
+	__m128i idx = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+	if (iv_len == 12) {
+		const int32_t *ivi = (const int32_t *) iv;
+		ctx->iv = _mm_shuffle_epi8(_mm_set_epi32(0, ivi[2], ivi[1], ivi[0]), idx);
+	} else if (iv_len == 16) {
+		ctx->iv = _mm_shuffle_epi8(_mm_loadu_si128((const __m128i *)iv), idx);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+}
+
+void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule) {
+	aes128ctx *ctx = _schedule;
+	ctx->iv = _mm_loadl_epi64((__m128i *)&iv);
+}
+
 void oqs_aes128_free_schedule_ni(void *schedule) {
 	if (schedule != NULL) {
-		OQS_MEM_secure_free(schedule, 11 * sizeof(__m128i));
+		OQS_MEM_secure_free(schedule, sizeof(aes128ctx));
 	}
 }
 
 // From crypto_core/aes128encrypt/dolbeau/aesenc-int
-static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char *n, unsigned char *out) {
-	__m128i nv = _mm_loadu_si128((const __m128i *)n);
+static inline void aes128ni_encrypt(const __m128i rkeys[11], __m128i nv, unsigned char *out) {
 	__m128i temp = _mm_xor_si128(nv, rkeys[0]);
 	temp = _mm_aesenc_si128(temp, rkeys[1]);
 	temp = _mm_aesenc_si128(temp, rkeys[2]);
@@ -71,9 +96,45 @@ static inline void aes128ni_encrypt(const __m128i rkeys[11], const unsigned char
 	_mm_storeu_si128((__m128i *)(out), temp);
 }
 
+// 4x interleaved encryption
+static inline void aes128ni_encrypt_x4(const __m128i rkeys[11], __m128i n0,
+                                       __m128i n1, __m128i n2, __m128i n3,
+                                       unsigned char *out) {
+	__m128i temp0 = _mm_xor_si128(n0, rkeys[0]);
+	__m128i temp1 = _mm_xor_si128(n1, rkeys[0]);
+	__m128i temp2 = _mm_xor_si128(n2, rkeys[0]);
+	__m128i temp3 = _mm_xor_si128(n3, rkeys[0]);
+
+#define AESNENCX4(IDX)                                                         \
+  temp0 = _mm_aesenc_si128(temp0, rkeys[IDX]);                                 \
+  temp1 = _mm_aesenc_si128(temp1, rkeys[IDX]);                                 \
+  temp2 = _mm_aesenc_si128(temp2, rkeys[IDX]);                                 \
+  temp3 = _mm_aesenc_si128(temp3, rkeys[IDX])
+
+	AESNENCX4(1);
+	AESNENCX4(2);
+	AESNENCX4(3);
+	AESNENCX4(4);
+	AESNENCX4(5);
+	AESNENCX4(6);
+	AESNENCX4(7);
+	AESNENCX4(8);
+	AESNENCX4(9);
+
+	temp0 = _mm_aesenclast_si128(temp0, rkeys[10]);
+	temp1 = _mm_aesenclast_si128(temp1, rkeys[10]);
+	temp2 = _mm_aesenclast_si128(temp2, rkeys[10]);
+	temp3 = _mm_aesenclast_si128(temp3, rkeys[10]);
+
+	_mm_storeu_si128((__m128i *)(out + 0), temp0);
+	_mm_storeu_si128((__m128i *)(out + 16), temp1);
+	_mm_storeu_si128((__m128i *)(out + 32), temp2);
+	_mm_storeu_si128((__m128i *)(out + 48), temp3);
+}
+
 void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext) {
-	const __m128i *schedule = (const __m128i *) _schedule;
-	aes128ni_encrypt(schedule, plaintext, ciphertext);
+	const __m128i *schedule = ((const aes128ctx *) _schedule)->sk_exp;
+	aes128ni_encrypt(schedule, _mm_loadu_si128((const __m128i *)plaintext), ciphertext);
 }
 
 void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
@@ -82,3 +143,61 @@ void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_
 		oqs_aes128_enc_sch_block_ni(plaintext + (16 * block), schedule, ciphertext + (16 * block));
 	}
 }
+
+void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks) {
+	aes128ctx *ctx = (aes128ctx *) schedule;
+	const __m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+
+	while (out_blks >= 4) {
+		__m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
+		__m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0)), mask);
+		__m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(2, 0)), mask);
+		__m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(ctx->iv, _mm_set_epi64x(3, 0)), mask);
+		aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
+		ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(4, 0));
+		out += 64;
+		out_blks -= 4;
+	}
+	while (out_blks >= 1) {
+		__m128i nv0 = _mm_shuffle_epi8(ctx->iv, mask);
+		aes128ni_encrypt(schedule, nv0, out);
+		ctx->iv = _mm_add_epi64(ctx->iv, _mm_set_epi64x(1, 0));
+		out += 16;
+		out_blks--;
+	}
+}
+
+void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	__m128i block;
+	__m128i mask = _mm_set_epi8(8, 9, 10, 11, 12, 13, 14, 15, 7, 6, 5, 4, 3, 2, 1, 0);
+	if (iv_len == 12) {
+		const int32_t *ivi = (const int32_t *) iv;
+		block = _mm_set_epi32(0, ivi[2], ivi[1], ivi[0]);
+	} else if (iv_len == 16) {
+		block = _mm_loadu_si128((const __m128i *)iv);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+
+	while (out_len >= 64) {
+		__m128i nv0 = block;
+		__m128i nv1 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
+		__m128i nv2 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(2, 0)), mask);
+		__m128i nv3 = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(3, 0)), mask);
+		aes128ni_encrypt_x4(schedule, nv0, nv1, nv2, nv3, out);
+		block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(4, 0)), mask);
+		out += 64;
+		out_len -= 64;
+	}
+	while (out_len >= 16) {
+		aes128ni_encrypt(schedule, block, out);
+		out += 16;
+		out_len -= 16;
+		block = _mm_shuffle_epi8(_mm_add_epi64(_mm_shuffle_epi8(block, mask), _mm_set_epi64x(1, 0)), mask);
+	}
+	if (out_len > 0) {
+		uint8_t tmp[16];
+		aes128ni_encrypt(schedule, block, tmp);
+		memcpy(out, tmp, out_len);
+	}
+}
diff --git a/src/common/aes/aes_c.c b/src/common/aes/aes_c.c
index 6ee93bc76a..f2ec57a500 100644
--- a/src/common/aes/aes_c.c
+++ b/src/common/aes/aes_c.c
@@ -574,6 +574,39 @@ static void aes_ecb(unsigned char *out, const unsigned char *in, size_t nblocks,
 	}
 }
 
+static inline void aes128_ctr_upd_blks(unsigned char *out, size_t outblks, aes128ctx *ctx) {
+	uint32_t ivw[16];
+	size_t i;
+	uint32_t cc;
+	uint8_t *iv = ctx->iv;
+	uint32_t blocks = (uint32_t) outblks;
+	unsigned int nrounds = 10;
+
+	br_range_dec32le(ivw, 4, iv);
+
+	memcpy(ivw +  4, ivw, 3 * sizeof(uint32_t));
+	memcpy(ivw +  8, ivw, 3 * sizeof(uint32_t));
+	memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t));
+	cc = br_swap32(ivw[3]);
+	ivw[ 7] = br_swap32(cc + 1);
+	ivw[11] = br_swap32(cc + 2);
+	ivw[15] = br_swap32(cc + 3);
+
+	while (outblks >= 4) {
+		aes_ctr4x(out, ivw, ctx->sk_exp, nrounds);
+		out += 64;
+		outblks -= 4;
+	}
+	if (outblks > 0) {
+		unsigned char tmp[64];
+		aes_ctr4x(tmp, ivw, ctx->sk_exp, nrounds);
+		for (i = 0; i < outblks * 16; i++) {
+			out[i] = tmp[i];
+		}
+	}
+	br_enc32be(&ctx->iv[12], cc + blocks);
+}
+
 static inline void aes256_ctr_upd_blks(unsigned char *out, size_t outblks, aes256ctx *ctx) {
 	uint32_t ivw[16];
 	size_t i;
@@ -725,12 +758,48 @@ void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule)
 	aes_keysched_no_bitslice(schedule, (const unsigned char *) key, 16);
 }
 
+void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule) {
+	aes128ctx *ctx = _schedule;
+	if (iv_len == 12) {
+		memcpy(ctx->iv, iv, 12);
+		memset(&ctx->iv[12], 0, 4);
+	} else if (iv_len == 16) {
+		memcpy(ctx->iv, iv, 16);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+}
+
+void oqs_aes128_load_iv_u64_c(uint64_t iv, void *schedule) {
+	OQS_EXIT_IF_NULLPTR(schedule, "AES");
+	aes128ctx *ctx = (aes128ctx *) schedule;
+	ctx->iv[7] = (unsigned char)(iv >> 56);
+	ctx->iv[6] = (unsigned char)(iv >> 48);
+	ctx->iv[5] = (unsigned char)(iv >> 40);
+	ctx->iv[4] = (unsigned char)(iv >> 32);
+	ctx->iv[3] = (unsigned char)(iv >> 24);
+	ctx->iv[2] = (unsigned char)(iv >> 16);
+	ctx->iv[1] = (unsigned char)(iv >>  8);
+	ctx->iv[0] = (unsigned char)iv;
+	memset(&ctx->iv[8], 0, 8);
+}
+
 void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
 	assert(plaintext_len % 16 == 0);
 	const aes128ctx *ctx = (const aes128ctx *) schedule;
 	aes_ecb(ciphertext, plaintext, plaintext_len / 16, ctx->sk_exp, 10);
 }
 
+void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	const aes128ctx *ctx = (const aes128ctx *) schedule;
+	aes_ctr(out, out_len, iv, iv_len, ctx->sk_exp, 10);
+}
+
+void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, uint8_t *out, size_t out_blks) {
+	aes128ctx *ctx = (aes128ctx *) schedule;
+	aes128_ctr_upd_blks(out, out_blks, ctx);
+}
+
 void oqs_aes256_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext) {
 	assert(plaintext_len % 16 == 0);
 	const aes256ctx *ctx = (const aes256ctx *) schedule;
diff --git a/src/common/aes/aes_impl.c b/src/common/aes/aes_impl.c
index ae9be662cf..706a5f186f 100644
--- a/src/common/aes/aes_impl.c
+++ b/src/common/aes/aes_impl.c
@@ -46,6 +46,26 @@ static void AES128_ECB_load_schedule(const uint8_t *key, void **_schedule) {
 	);
 }
 
+static void AES128_CTR_inc_init(const uint8_t *key, void **_schedule) {
+	AES128_ECB_load_schedule(key, _schedule);
+}
+
+static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *_schedule) {
+	C_OR_NI_OR_ARM(
+	    oqs_aes128_load_iv_c(iv, iv_len, _schedule),
+	    oqs_aes128_load_iv_ni(iv, iv_len, _schedule),
+	    oqs_aes128_load_iv_armv8(iv, iv_len, _schedule)
+	);
+}
+
+static void AES128_CTR_inc_ivu64(uint64_t iv, void *_schedule) {
+	C_OR_NI_OR_ARM(
+	    oqs_aes128_load_iv_u64_c(iv, _schedule),
+	    oqs_aes128_load_iv_u64_ni(iv, _schedule),
+	    (void) iv; (void) _schedule
+	);
+}
+
 static void AES128_free_schedule(void *schedule) {
 	C_OR_NI_OR_ARM(
 	    oqs_aes128_free_schedule_c(schedule),
@@ -107,6 +127,14 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
 	);
 }
 
+static void AES128_CTR_inc_stream_iv(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	C_OR_NI_OR_ARM(
+	    oqs_aes128_ctr_enc_sch_c(iv, iv_len, schedule, out, out_len),
+	    oqs_aes128_ctr_enc_sch_ni(iv, iv_len, schedule, out, out_len),
+	    oqs_aes128_ctr_enc_sch_armv8(iv, iv_len, schedule, out, out_len)
+	);
+}
+
 static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
 
 static void AES256_ECB_enc(const uint8_t *plaintext, const size_t plaintext_len, const uint8_t *key, uint8_t *ciphertext) {
@@ -141,19 +169,23 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_
 }
 
 struct OQS_AES_callbacks aes_default_callbacks = {
-	AES128_ECB_load_schedule,
-	AES128_free_schedule,
-	AES128_ECB_enc,
-	AES128_ECB_enc_sch,
-	AES256_ECB_load_schedule,
-	AES256_CTR_inc_init,
-	AES256_CTR_inc_iv,
-	AES256_CTR_inc_ivu64,
-	AES256_free_schedule,
-	AES256_ECB_enc,
-	AES256_ECB_enc_sch,
-	AES256_CTR_inc_stream_iv,
-	AES256_CTR_inc_stream_blks,
+	.AES128_ECB_load_schedule = AES128_ECB_load_schedule,
+	.AES128_CTR_inc_init = AES128_CTR_inc_init,
+	.AES128_CTR_inc_iv = AES128_CTR_inc_iv,
+	.AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64,
+	.AES128_free_schedule = AES128_free_schedule,
+	.AES128_ECB_enc = AES128_ECB_enc,
+	.AES128_ECB_enc_sch = AES128_ECB_enc_sch,
+	.AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv,
+	.AES256_ECB_load_schedule = AES256_ECB_load_schedule,
+	.AES256_CTR_inc_init = AES256_CTR_inc_init,
+	.AES256_CTR_inc_iv = AES256_CTR_inc_iv,
+	.AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64,
+	.AES256_free_schedule = AES256_free_schedule,
+	.AES256_ECB_enc = AES256_ECB_enc,
+	.AES256_ECB_enc_sch = AES256_ECB_enc_sch,
+	.AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv,
+	.AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks,
 };
 
 void OQS_AES_init(void) {
diff --git a/src/common/aes/aes_local.h b/src/common/aes/aes_local.h
index 4c9942a085..a9001a2e31 100644
--- a/src/common/aes/aes_local.h
+++ b/src/common/aes/aes_local.h
@@ -3,18 +3,29 @@
 #include <stdint.h>
 
 void oqs_aes128_load_schedule_ni(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule);
+void oqs_aes128_load_iv_u64_ni(uint64_t iv, void *_schedule);
 void oqs_aes128_free_schedule_ni(void *schedule);
 void oqs_aes128_enc_sch_block_ni(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
 void oqs_aes128_ecb_enc_sch_ni(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_ni(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_ni(void *schedule, uint8_t *out, size_t out_blks); // out_blks counts 16-byte blocks, not bytes
 
 void oqs_aes128_load_schedule_c(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_c(const uint8_t *iv, size_t iv_len, void *_schedule);
+void oqs_aes128_load_iv_u64_c(uint64_t iv, void *_schedule);
 void oqs_aes128_free_schedule_c(void *schedule);
 void oqs_aes128_ecb_enc_sch_c(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_c(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_c(void *schedule, uint8_t *out, size_t out_blks); // out_blks counts 16-byte blocks, not bytes
 
 void oqs_aes128_load_schedule_no_bitslice(const uint8_t *key, void **_schedule);
+void oqs_aes128_load_iv_armv8(const uint8_t *iv, size_t iv_len, void *_schedule);
 void oqs_aes128_free_schedule_no_bitslice(void *schedule);
 void oqs_aes128_enc_sch_block_armv8(const uint8_t *plaintext, const void *_schedule, uint8_t *ciphertext);
 void oqs_aes128_ecb_enc_sch_armv8(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
+void oqs_aes128_ctr_enc_sch_armv8(const uint8_t *iv, const size_t iv_len, const void *schedule, uint8_t *out, size_t out_len);
+void oqs_aes128_ctr_enc_sch_upd_blks_armv8(void *schedule, uint8_t *out, size_t out_blks);
 
 void oqs_aes256_load_schedule_ni(const uint8_t *key, void **_schedule);
 void oqs_aes256_load_iv_ni(const uint8_t *iv, size_t iv_len, void *_schedule);
diff --git a/src/common/aes/aes_ops.h b/src/common/aes/aes_ops.h
index 5a26f75764..a64c47d28d 100644
--- a/src/common/aes/aes_ops.h
+++ b/src/common/aes/aes_ops.h
@@ -25,6 +25,21 @@ struct OQS_AES_callbacks {
 	 */
 	void (*AES128_ECB_load_schedule)(const uint8_t *key, void **ctx);
 
+	/**
+	 * Implementation of function OQS_AES128_CTR_inc_init.
+	 */
+	void (*AES128_CTR_inc_init)(const uint8_t *key, void **ctx);
+
+	/**
+	 * Implementation of function OQS_AES128_CTR_inc_iv.
+	 */
+	void (*AES128_CTR_inc_iv)(const uint8_t *iv, size_t iv_len, void *ctx);
+
+	/**
+	 * Implementation of function OQS_AES128_CTR_inc_ivu64.
+	 */
+	void (*AES128_CTR_inc_ivu64)(uint64_t iv, void *ctx);
+
 	/**
 	 * Implementation of function OQS_AES128_free_schedule.
 	 */
@@ -40,6 +55,11 @@ struct OQS_AES_callbacks {
 	 */
 	void (*AES128_ECB_enc_sch)(const uint8_t *plaintext, const size_t plaintext_len, const void *schedule, uint8_t *ciphertext);
 
+	/**
+	* Implementation of function OQS_AES128_CTR_inc_stream_iv.
+	*/
+	void (*AES128_CTR_inc_stream_iv)(const uint8_t *iv, size_t iv_len, const void *ctx, uint8_t *out, size_t out_len);
+
 	/**
 	 * Implementation of function OQS_AES256_ECB_load_schedule.
 	 */
diff --git a/src/common/aes/aes_ossl.c b/src/common/aes/aes_ossl.c
index feaff39557..c7dc5b9445 100644
--- a/src/common/aes/aes_ossl.c
+++ b/src/common/aes/aes_ossl.c
@@ -66,6 +66,67 @@ static void AES128_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
 	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ks->ctx, ciphertext, &outlen));
 }
 
+static void AES128_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
+	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)(); // resolve the symbol via OSSL_FUNC, then call it
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
+	uint8_t iv_ctr[16]; // full 16-byte counter block handed to OpenSSL
+	if (iv_len == 12) {
+		memcpy(iv_ctr, iv, 12);
+		iv_ctr[12] = 0; // 12-byte IV: the 4 counter bytes start at zero
+		iv_ctr[13] = 0;
+		iv_ctr[14] = 0;
+		iv_ctr[15] = 0;
+	} else if (iv_len == 16) {
+		memcpy(iv_ctr, iv, 16);
+	} else {
+		exit(EXIT_FAILURE); // only 12- and 16-byte IVs are supported
+	}
+	const struct key_schedule *ks = (const struct key_schedule *) schedule;
+	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ctr_ctx, oqs_aes_128_ctr(), NULL, ks->key, iv_ctr));
+
+	SIZE_T_TO_INT_OR_EXIT(out_len, out_len_input_int)
+	memset(out, 0, (size_t)out_len_input_int); // encrypting zeros in place leaves the raw keystream in out
+	int out_len_output;
+	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptUpdate)(ctr_ctx, out, &out_len_output, out, out_len_input_int));
+	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptFinal_ex)(ctr_ctx, out + out_len_output, &out_len_output));
+	OSSL_FUNC(EVP_CIPHER_CTX_free)(ctr_ctx);
+}
+
+static void AES128_CTR_inc_init(const uint8_t *key, void **schedule) {
+	*schedule = malloc(sizeof(struct key_schedule));
+	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
+	struct key_schedule *ks = (struct key_schedule *) *schedule;
+	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
+
+	ks->for_ECB = 0;
+	ks->ctx = ctr_ctx;
+	memcpy(ks->key, key, 16);
+}
+
+static void AES128_CTR_inc_iv(const uint8_t *iv, size_t iv_len, void *schedule) {
+	OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL");
+	struct key_schedule *ks = (struct key_schedule *) schedule;
+	if (iv_len == 12) {
+		memcpy(ks->iv, iv, 12);
+		memset(&ks->iv[12], 0, 4);
+	} else if (iv_len == 16) {
+		memcpy(ks->iv, iv, 16);
+	} else {
+		exit(EXIT_FAILURE);
+	}
+	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv));
+}
+
+static void AES128_CTR_inc_ivu64(uint64_t iv, void *schedule) {
+	OQS_EXIT_IF_NULLPTR(schedule, "OpenSSL");
+	struct key_schedule *ks = (struct key_schedule *) schedule;
+	br_enc64be(ks->iv, iv);
+	memset(&ks->iv[8], 0, 8);
+	OQS_OPENSSL_GUARD(OSSL_FUNC(EVP_EncryptInit_ex)(ks->ctx, oqs_aes_128_ctr(), NULL, ks->key, ks->iv));
+}
+
 static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) {
 	*schedule = malloc(sizeof(struct key_schedule));
 	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
@@ -79,11 +140,12 @@ static void AES256_ECB_load_schedule(const uint8_t *key, void **schedule) {
 
 static void AES256_CTR_inc_init(const uint8_t *key, void **schedule) {
 	*schedule = malloc(sizeof(struct key_schedule));
+	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
+
 	struct key_schedule *ks = (struct key_schedule *) *schedule;
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 
-	OQS_EXIT_IF_NULLPTR(*schedule, "OpenSSL");
 	ks->for_ECB = 0;
 	ks->ctx = ctr_ctx;
 	memcpy(ks->key, key, 32);
@@ -130,7 +192,7 @@ static void AES256_ECB_enc_sch(const uint8_t *plaintext, const size_t plaintext_
 
 static void AES256_CTR_inc_stream_iv(const uint8_t *iv, size_t iv_len, const void *schedule, uint8_t *out, size_t out_len) {
 	EVP_CIPHER_CTX *ctr_ctx = OSSL_FUNC(EVP_CIPHER_CTX_new)();
-	assert(ctr_ctx != NULL);
+	OQS_EXIT_IF_NULLPTR(ctr_ctx, "OpenSSL");
 	uint8_t iv_ctr[16];
 	if (iv_len == 12) {
 		memcpy(iv_ctr, iv, 12);
@@ -164,17 +226,21 @@ static void AES256_CTR_inc_stream_blks(void *schedule, uint8_t *out, size_t out_
 }
 
 struct OQS_AES_callbacks aes_default_callbacks = {
-	AES128_ECB_load_schedule,
-	AES128_free_schedule,
-	AES128_ECB_enc,
-	AES128_ECB_enc_sch,
-	AES256_ECB_load_schedule,
-	AES256_CTR_inc_init,
-	AES256_CTR_inc_iv,
-	AES256_CTR_inc_ivu64,
-	AES256_free_schedule,
-	AES256_ECB_enc,
-	AES256_ECB_enc_sch,
-	AES256_CTR_inc_stream_iv,
-	AES256_CTR_inc_stream_blks,
+	.AES128_ECB_load_schedule = AES128_ECB_load_schedule,
+	.AES128_CTR_inc_init = AES128_CTR_inc_init,
+	.AES128_CTR_inc_iv = AES128_CTR_inc_iv,
+	.AES128_CTR_inc_ivu64 = AES128_CTR_inc_ivu64,
+	.AES128_free_schedule = AES128_free_schedule,
+	.AES128_ECB_enc = AES128_ECB_enc,
+	.AES128_ECB_enc_sch = AES128_ECB_enc_sch,
+	.AES128_CTR_inc_stream_iv = AES128_CTR_inc_stream_iv,
+	.AES256_ECB_load_schedule = AES256_ECB_load_schedule,
+	.AES256_CTR_inc_init = AES256_CTR_inc_init,
+	.AES256_CTR_inc_iv = AES256_CTR_inc_iv,
+	.AES256_CTR_inc_ivu64 = AES256_CTR_inc_ivu64,
+	.AES256_free_schedule = AES256_free_schedule,
+	.AES256_ECB_enc = AES256_ECB_enc,
+	.AES256_ECB_enc_sch = AES256_ECB_enc_sch,
+	.AES256_CTR_inc_stream_iv = AES256_CTR_inc_stream_iv,
+	.AES256_CTR_inc_stream_blks = AES256_CTR_inc_stream_blks,
 };
diff --git a/src/common/ossl_functions.h b/src/common/ossl_functions.h
index aa0ceb127c..438ec1fafa 100644
--- a/src/common/ossl_functions.h
+++ b/src/common/ossl_functions.h
@@ -25,6 +25,7 @@ VOID_FUNC(void, EVP_MD_CTX_free, (EVP_MD_CTX *ctx), (ctx))
 FUNC(EVP_MD_CTX *, EVP_MD_CTX_new, (void), ())
 FUNC(int, EVP_MD_CTX_reset, (EVP_MD_CTX *ctx), (ctx))
 FUNC(const EVP_CIPHER *, EVP_aes_128_ecb, (void), ())
+FUNC(const EVP_CIPHER *, EVP_aes_128_ctr, (void), ())
 FUNC(const EVP_CIPHER *, EVP_aes_256_ecb, (void), ())
 FUNC(const EVP_CIPHER *, EVP_aes_256_ctr, (void), ())
 #if OPENSSL_VERSION_NUMBER >= 0x30000000L
diff --git a/src/common/ossl_helpers.c b/src/common/ossl_helpers.c
index 1c73d8b901..76dccb0ef4 100644
--- a/src/common/ossl_helpers.c
+++ b/src/common/ossl_helpers.c
@@ -18,7 +18,7 @@ static EVP_MD *sha256_ptr, *sha384_ptr, *sha512_ptr,
        *sha3_256_ptr, *sha3_384_ptr, *sha3_512_ptr,
        *shake128_ptr, *shake256_ptr;
 
-static EVP_CIPHER *aes128_ecb_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr;
+static EVP_CIPHER *aes128_ecb_ptr, *aes128_ctr_ptr, *aes256_ecb_ptr, *aes256_ctr_ptr;
 
 static void fetch_ossl_objects(void) {
 	sha256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHA256", NULL);
@@ -32,12 +32,13 @@ static void fetch_ossl_objects(void) {
 	shake256_ptr = OSSL_FUNC(EVP_MD_fetch)(NULL, "SHAKE256", NULL);
 
 	aes128_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-ECB", NULL);
+	aes128_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-128-CTR", NULL);
 	aes256_ecb_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-ECB", NULL);
 	aes256_ctr_ptr = OSSL_FUNC(EVP_CIPHER_fetch)(NULL, "AES-256-CTR", NULL);
 
 	if (!sha256_ptr || !sha384_ptr || !sha512_ptr || !sha3_256_ptr ||
 	        !sha3_384_ptr || !sha3_512_ptr || !shake128_ptr || !shake256_ptr ||
-	        !aes128_ecb_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) {
+	        !aes128_ecb_ptr || !aes128_ctr_ptr || !aes256_ecb_ptr || !aes256_ctr_ptr) {
 		fprintf(stderr, "liboqs warning: OpenSSL initialization failure. Is provider for SHA, SHAKE, AES enabled?\n");
 	}
 }
@@ -61,6 +62,8 @@ static void free_ossl_objects(void) {
 	shake256_ptr = NULL;
 	OSSL_FUNC(EVP_CIPHER_free)(aes128_ecb_ptr);
 	aes128_ecb_ptr = NULL;
+	OSSL_FUNC(EVP_CIPHER_free)(aes128_ctr_ptr);
+	aes128_ctr_ptr = NULL;
 	OSSL_FUNC(EVP_CIPHER_free)(aes256_ecb_ptr);
 	aes256_ecb_ptr = NULL;
 	OSSL_FUNC(EVP_CIPHER_free)(aes256_ctr_ptr);
@@ -75,7 +78,7 @@ void oqs_ossl_destroy(void) {
 #else
 	if (sha256_ptr || sha384_ptr || sha512_ptr || sha3_256_ptr ||
 	        sha3_384_ptr || sha3_512_ptr || shake128_ptr || shake256_ptr ||
-	        aes128_ecb_ptr || aes256_ecb_ptr || aes256_ctr_ptr) {
+	        aes128_ecb_ptr || aes128_ctr_ptr || aes256_ecb_ptr || aes256_ctr_ptr) {
 		free_ossl_objects();
 	}
 #endif
@@ -235,6 +238,23 @@ const EVP_CIPHER *oqs_aes_128_ecb(void) {
 #endif
 }
 
+/* Returns the EVP_CIPHER object for AES-128-CTR.
+ * OpenSSL < 3.0: whatever EVP_aes_128_ctr() returns. OpenSSL >= 3.0: the handle cached by
+ * fetch_ossl_objects(); NULL if that fetch failed or (pthreads build) pthread_once reported an error. */
+const EVP_CIPHER *oqs_aes_128_ctr(void) {
+#if OPENSSL_VERSION_NUMBER < 0x30000000L
+	return OSSL_FUNC(EVP_aes_128_ctr)();
+#elif defined(OQS_USE_PTHREADS)
+	return pthread_once(&init_once_control, fetch_ossl_objects) ? NULL : aes128_ctr_ptr;
+#else
+	/* No pthreads: fetch on first use, i.e. while the cached handle is still NULL. */
+	if (aes128_ctr_ptr == NULL) {
+		fetch_ossl_objects();
+	}
+	return aes128_ctr_ptr;
+#endif
+}
+
 const EVP_CIPHER *oqs_aes_256_ecb(void) {
 #if OPENSSL_VERSION_NUMBER >= 0x30000000L
 #if defined(OQS_USE_PTHREADS)
diff --git a/src/common/ossl_helpers.h b/src/common/ossl_helpers.h
index fe6d34687a..3e1bc9ff25 100644
--- a/src/common/ossl_helpers.h
+++ b/src/common/ossl_helpers.h
@@ -31,6 +31,8 @@ const EVP_MD *oqs_sha3_512(void);
 
 const EVP_CIPHER *oqs_aes_128_ecb(void);
 
+const EVP_CIPHER *oqs_aes_128_ctr(void); /* AES-128-CTR cipher handle; can be NULL when the OpenSSL 3 fetch or the one-time init failed */
+
 const EVP_CIPHER *oqs_aes_256_ecb(void);
 
 const EVP_CIPHER *oqs_aes_256_ctr(void);
diff --git a/src/common/pqclean_shims/aes.h b/src/common/pqclean_shims/aes.h
index 58ae1e67c9..dc72a9e157 100644
--- a/src/common/pqclean_shims/aes.h
+++ b/src/common/pqclean_shims/aes.h
@@ -12,6 +12,7 @@
 #define AESCTR_NONCEBYTES 12
 #define AES_BLOCKBYTES 16
 
+typedef void *aes128ctx; /* opaque AES-128 context handle: filled in by OQS_AES128_CTR_inc_init, released with OQS_AES128_free_schedule */
 typedef void *aes256ctx;
 
 #define aes256_ecb_keyexp(r, key) OQS_AES256_ECB_load_schedule((key), (r))
@@ -43,4 +44,12 @@ static inline void aes256ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[
 	OQS_AES256_free_schedule(state);
 }
 
+/* AES-128 sibling of aes256ctr_prf: builds a CTR context from key, calls OQS_AES128_CTR_inc_stream_iv with the 12-byte nonce and the out/outlen buffer, then frees the context. */
+static inline void aes128ctr_prf(uint8_t *out, size_t outlen, const uint8_t key[16], uint8_t nonce[12]) {
+	aes128ctx ctx;
+	OQS_AES128_CTR_inc_init(key, &ctx);
+	OQS_AES128_CTR_inc_stream_iv(nonce, AESCTR_NONCEBYTES, ctx, out, outlen);
+	OQS_AES128_free_schedule(ctx);
+}
+
 #endif
diff --git a/src/oqsconfig.h.cmake b/src/oqsconfig.h.cmake
index d2d01c4771..f16421fa43 100644
--- a/src/oqsconfig.h.cmake
+++ b/src/oqsconfig.h.cmake
@@ -189,6 +189,16 @@
 #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256f_simple_avx2 1
 #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple 1
 #cmakedefine OQS_ENABLE_SIG_sphincs_shake_256s_simple_avx2 1
+// MAYO signature family: one family-wide switch, one switch per parameter set, and an _avx2 switch for each set.
+#cmakedefine OQS_ENABLE_SIG_MAYO 1
+#cmakedefine OQS_ENABLE_SIG_mayo_1 1
+#cmakedefine OQS_ENABLE_SIG_mayo_1_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_2_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_3 1
+#cmakedefine OQS_ENABLE_SIG_mayo_3_avx2 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5 1
+#cmakedefine OQS_ENABLE_SIG_mayo_5_avx2 1
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ADD_ALG_ENABLE_DEFINES_END
 
 #cmakedefine OQS_ENABLE_SIG_STFL_XMSS 1
diff --git a/src/sig/mayo/CMakeLists.txt b/src/sig/mayo/CMakeLists.txt
new file mode 100644
index 0000000000..e049f71344
--- /dev/null
+++ b/src/sig/mayo/CMakeLists.txt
@@ -0,0 +1,80 @@
+# SPDX-License-Identifier: MIT
+
+# This file was generated by
+# scripts/copy_from_upstream/copy_from_upstream.py
+
+set(_MAYO_OBJS "")  # collects $<TARGET_OBJECTS:...> of every enabled MAYO target; exported as MAYO_OBJS at the end of this file
+
+if(OQS_ENABLE_SIG_mayo_1)
+    # MAYO-1 "opt" objects (MAYO_BUILD_TYPE_OPT); sig_mayo_1.c is compiled in this target only. Compile flags are set once.
+    add_library(mayo_1_opt OBJECT sig_mayo_1.c pqmayo_mayo-1_opt/api.c pqmayo_mayo-1_opt/arithmetic.c pqmayo_mayo-1_opt/mayo.c pqmayo_mayo-1_opt/params.c)
+    target_include_directories(mayo_1_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_opt)
+    target_include_directories(mayo_1_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_1_opt PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_1_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_1_avx2)
+    # MAYO-1 AVX2 objects: upstream sources only, built with -mavx2 plus the MAYO_BUILD_TYPE_AVX2 / MAYO_AVX defines.
+    add_library(mayo_1_avx2 OBJECT pqmayo_mayo-1_avx2/api.c pqmayo_mayo-1_avx2/arithmetic.c pqmayo_mayo-1_avx2/mayo.c pqmayo_mayo-1_avx2/params.c)
+    target_include_directories(mayo_1_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-1_avx2 ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_1_avx2 PRIVATE -mavx2)
+    target_compile_options(mayo_1_avx2 PUBLIC -DMAYO_VARIANT=MAYO_1 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_1_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_2)
+    # MAYO-2 "opt" objects (MAYO_BUILD_TYPE_OPT); sig_mayo_2.c is compiled in this target only. Compile flags are set once.
+    add_library(mayo_2_opt OBJECT sig_mayo_2.c pqmayo_mayo-2_opt/api.c pqmayo_mayo-2_opt/arithmetic.c pqmayo_mayo-2_opt/mayo.c pqmayo_mayo-2_opt/params.c)
+    target_include_directories(mayo_2_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_opt)
+    target_include_directories(mayo_2_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_2_opt PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_2_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_2_avx2)
+    # MAYO-2 AVX2 objects: upstream sources only, built with -mavx2 plus the MAYO_BUILD_TYPE_AVX2 / MAYO_AVX defines.
+    add_library(mayo_2_avx2 OBJECT pqmayo_mayo-2_avx2/api.c pqmayo_mayo-2_avx2/arithmetic.c pqmayo_mayo-2_avx2/mayo.c pqmayo_mayo-2_avx2/params.c)
+    target_include_directories(mayo_2_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-2_avx2 ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_2_avx2 PRIVATE -mavx2)
+    target_compile_options(mayo_2_avx2 PUBLIC -DMAYO_VARIANT=MAYO_2 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_2_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_3)
+    # MAYO-3 "opt" objects (MAYO_BUILD_TYPE_OPT); sig_mayo_3.c is compiled in this target only. Compile flags are set once.
+    add_library(mayo_3_opt OBJECT sig_mayo_3.c pqmayo_mayo-3_opt/api.c pqmayo_mayo-3_opt/arithmetic.c pqmayo_mayo-3_opt/mayo.c pqmayo_mayo-3_opt/params.c)
+    target_include_directories(mayo_3_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_opt)
+    target_include_directories(mayo_3_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_3_opt PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_3_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_3_avx2)
+    # MAYO-3 AVX2 objects: upstream sources only, built with -mavx2 plus the MAYO_BUILD_TYPE_AVX2 / MAYO_AVX defines.
+    add_library(mayo_3_avx2 OBJECT pqmayo_mayo-3_avx2/api.c pqmayo_mayo-3_avx2/arithmetic.c pqmayo_mayo-3_avx2/mayo.c pqmayo_mayo-3_avx2/params.c)
+    target_include_directories(mayo_3_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-3_avx2 ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_3_avx2 PRIVATE -mavx2)
+    target_compile_options(mayo_3_avx2 PUBLIC -DMAYO_VARIANT=MAYO_3 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_3_avx2>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_5)
+    # MAYO-5 "opt" objects (MAYO_BUILD_TYPE_OPT); sig_mayo_5.c is compiled in this target only. Compile flags are set once.
+    add_library(mayo_5_opt OBJECT sig_mayo_5.c pqmayo_mayo-5_opt/api.c pqmayo_mayo-5_opt/arithmetic.c pqmayo_mayo-5_opt/mayo.c pqmayo_mayo-5_opt/params.c)
+    target_include_directories(mayo_5_opt PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_opt)
+    target_include_directories(mayo_5_opt PRIVATE ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_5_opt PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_OPT -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_5_opt>)
+endif()
+
+if(OQS_ENABLE_SIG_mayo_5_avx2)
+    # MAYO-5 AVX2 objects: upstream sources only, built with -mavx2 plus the MAYO_BUILD_TYPE_AVX2 / MAYO_AVX defines.
+    add_library(mayo_5_avx2 OBJECT pqmayo_mayo-5_avx2/api.c pqmayo_mayo-5_avx2/arithmetic.c pqmayo_mayo-5_avx2/mayo.c pqmayo_mayo-5_avx2/params.c)
+    target_include_directories(mayo_5_avx2 PRIVATE ${CMAKE_CURRENT_LIST_DIR}/pqmayo_mayo-5_avx2 ${PROJECT_SOURCE_DIR}/src/common/pqclean_shims)
+    target_compile_options(mayo_5_avx2 PRIVATE -mavx2)
+    target_compile_options(mayo_5_avx2 PUBLIC -DMAYO_VARIANT=MAYO_5 -DMAYO_BUILD_TYPE_AVX2 -DMAYO_AVX -DHAVE_RANDOMBYTES_NORETVAL -DHAVE_STACKEFFICIENT)
+    list(APPEND _MAYO_OBJS $<TARGET_OBJECTS:mayo_5_avx2>)
+endif()
+
+set(MAYO_OBJS ${_MAYO_OBJS} PARENT_SCOPE)  # publish the collected object list to the scope that included this directory
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); /* declaration only; the definition is not in this header */
+#define AES_ECB_encrypt AES_256_ECB /* alias: uses of AES_ECB_encrypt resolve to AES_256_ECB */
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen); /* declaration only; defined outside this header */
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen); /* declaration only; defined outside this header */
+#define AES_128_CTR AES_128_CTR_NI /* with ENABLE_AESNI defined, AES_128_CTR resolves to AES_128_CTR_NI */
+#else
+#include <aes.h>
+/* AES_128_CTR used when ENABLE_AESNI is not defined: passes input to aes128ctr_prf as the key together with an all-zero 12-byte nonce, and returns outputByteLen cast to int. */
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_nonce[12] = { 0 };
+    (void) inputByteLen; /* parameter is not read in this variant */
+    aes128ctr_prf(output, outputByteLen, input, zero_nonce);
+    return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.c b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c
new file mode 100644
index 0000000000..b7e2ef80ce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_1 /* dynamic parameters: pass the address of the MAYO_1 parameter object */
+#else
+#define MAYO_PARAMS 0 /* otherwise a null parameter pointer is passed; NOTE(review): presumably the mayo_* routines then rely on compile-time constants -- confirm */
+#endif
+
+/* Key generation entry point: forwards pk and sk to mayo_keypair with MAYO_PARAMS. */
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+/* Signing entry point with the sm/smlen output pair: forwards every argument to mayo_sign with MAYO_PARAMS. */
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) {
+    const int status = mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+    return status;
+}
+
+/* Signing entry point with the sig/siglen output pair: forwards every argument to mayo_sign_signature with MAYO_PARAMS. */
+int crypto_sign_signature(unsigned char *sig, size_t *siglen, const unsigned char *m,
+                          size_t mlen, const unsigned char *sk) {
+    const int status = mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+    return status;
+}
+
+/* Open entry point for the sm/smlen input pair: forwards every argument to mayo_open with MAYO_PARAMS. */
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+    const int status = mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+    return status;
+}
+
+/* Verification entry point: returns -1 for any siglen other than CRYPTO_BYTES, otherwise the result of mayo_verify. */
+int crypto_sign_verify(const unsigned char *sig, size_t siglen, const unsigned char *m,
+                       size_t mlen, const unsigned char *pk) {
+    if (siglen == CRYPTO_BYTES) {
+        return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+    }
+    return -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/api.h b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h
new file mode 100644
index 0000000000..86b7bd545d
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 24 /* presumably the secret key length in bytes -- TODO confirm */
+#define CRYPTO_PUBLICKEYBYTES 1168 /* presumably the public key length in bytes -- TODO confirm */
+#define CRYPTO_BYTES 321 /* the only siglen crypto_sign_verify (api.c) accepts */
+
+#define CRYPTO_ALGNAME "MAYO-1"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) /* the name expands through MAYO_NAMESPACE (defined elsewhere; presumably a per-variant prefix -- confirm) */
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk); /* api.c: forwards to mayo_keypair */
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign) /* same MAYO_NAMESPACE wrapping */
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk); /* api.c: forwards to mayo_sign */
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) /* same MAYO_NAMESPACE wrapping */
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk); /* api.c: forwards to mayo_sign_signature */
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) /* same MAYO_NAMESPACE wrapping */
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk); /* api.c: forwards to mayo_open */
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) /* same MAYO_NAMESPACE wrapping */
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk); /* api.c: -1 unless siglen == CRYPTO_BYTES, else mayo_verify's result */
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { // folds a size x size matrix of m-vectors into its upper triangle, written to out row by row
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size); // dedicated routine used by the fixed-variant AVX build with M_MAX == 64
+#else
+    int m_vecs_stored = 0; // number of entries already written to out
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++) {
+            m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); // next output entry starts as in[r][c]; one entry spans m_legs * 2 words
+            if (r != c) {
+                m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); // off-diagonal: fold in the mirrored entry in[c][r]
+            }
+            m_vecs_stored ++;
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ // acc += (P1 + P1^T) * O; fixed-variant AVX builds dispatch to a per-M_MAX routine
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    // P1 holds only the entries with c >= r, row by row; this is the running index into it.
+    int bs_mat_entries_used = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) { // the diagonal of P1 + P1^T is zero (addition is XOR), so only advance the index
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                // acc[r][k] += P1[r][c] * O[c][k]  and  acc[c][k] += P1[r][c] * O[r][k]
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { // Sign: M = V*L and Y = V*P1*V^T
+    (void) p; // p is only read on the generic path at the bottom
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch passed between the P1_times_Vt and Vt_times_Pv steps
+    mayo_V_multabs_avx2(V, V_multabs); // precompute the multiplication tables for the entries of V
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch, same role as on the AVX paths
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+        param_k, param_n - param_o, param_o); // M from V and L
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); // Pv from the upper-triangular P1 and V (transposed variant)
+
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+        Y, param_k, param_n - param_o,
+        param_k); // Y from V and Pv
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { // KeyGen: P3 = O^T * (P1*O + P2)
+    (void) p; // p is only read on the generic path at the bottom
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX]; // one table per pair of adjacent entries in each row of O
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32; // same value as the param_m/32 passed on the next line
+    mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); // presumably P1O_P2 holds P2 on entry and receives += P1*O - confirm
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); // P3 from O (transposed variant) and P1O_P2
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) { // NOTE(review): the inclusive bound lets col reach k*o, so x[k*o] is touched - confirm callers give x room beyond the documented k*o bytes
+
+            // ct_compare_8 compares two bytes in constant time.
+            // It returns 0x00 if they are equal, 0xff otherwise; masking with ~finished keeps only the first nonzero column of the row.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+
+            for (int i = 0; i < row; i += 8) { // update the last column of the rows above, 8 rows per step (the last group may extend past row)
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp); // presumably multiplies each of the 8 packed bytes by u - confirm against mul_fx8
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h> // NOTE(review): duplicate of the include two lines up; harmless
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64) // NOTE(review): tests defined(MAYO_AVX) while arithmetic.c tests the value of MAYO_AVX - confirm MAYO_AVX is never defined as 0
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h> // NOTE(review): 96-element header pulled into the 64-element branch - confirm this is intended
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h> // NOTE(review): 128-element header pulled into the 96-element branch - confirm this is intended
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) { // out = in for one 128-element vector (8 words, 16 four-bit elements per word)
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+    out[4] = in[4];
+    out[5] = in[5];
+    out[6] = in[6];
+    out[7] = in[7];
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) { // acc += in over 8 words; addition in this field is XOR
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+    acc[4] ^= in[4];
+    acc[5] ^= in[5];
+    acc[6] ^= in[6];
+    acc[7] ^= in[7];
+}
+
+inline
+static void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) { // acc += x * in, element-wise over 8 words
+    for(int i=0;i<8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2); // 0x2 encodes the field element x
+    }
+}
+inline
+static void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) { // acc += x^-1 * in, element-wise over 8 words
+    for(int i=0;i<8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9); // 0x9 = x^3 + 1, the inverse of x: x * (x^3 + 1) = x^4 + x = 1 mod x^4 + x + 1
+    }
+}
+
+static 
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) { // acc += a * in for a single field element a, over 8 words
+    for(int i=0; i < 8;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);        
+    }
+}
+
+static 
+    inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // bins holds 16 vectors (bin i at bins + i * 8); the chain below folds them in place into bin 1, which works out to the sum over i of i * bins[i]; bins is modified.
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins +  10 * 8);
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+    m_vec_mul_add_x_inv_128(bins +  10 * 8, bins +  7 * 8);
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8);
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins +  14 * 8);
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8);
+    m_vec_mul_add_x_inv_128(bins +  14 * 8, bins +  15 * 8);
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8);
+    m_vec_mul_add_x_inv_128(bins +  15 * 8, bins +  13 * 8);
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8);
+    m_vec_mul_add_x_inv_128(bins +  13 * 8, bins +  9 * 8);
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8);
+    m_vec_mul_add_x_inv_128(bins +   9 * 8, bins +  1 * 8);
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8);
+    vec_copy_128(bins + 8, out); // bins + 8 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) { // out = in for one 64-element vector (4 words, 16 four-bit elements per word)
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) { // acc += in over 4 words; addition in this field is XOR
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+}
+
+static 
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) { // acc += a * in for a single field element a, over 4 words
+    for(int i=0; i < 4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);        
+    }
+}
+
+inline
+static void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) { // acc += x * in, element-wise over 4 words
+    for(int i=0;i<4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2); // 0x2 encodes the field element x
+    }
+}
+inline
+static void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) { // acc += x^-1 * in, element-wise over 4 words
+    for(int i=0;i<4;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9); // 0x9 = x^3 + 1, the inverse of x: x * (x^3 + 1) = x^4 + x = 1 mod x^4 + x + 1
+    }
+}
+
+static 
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // bins holds 16 vectors (bin i at bins + i * 4); the chain below folds them in place into bin 1, which works out to the sum over i of i * bins[i]; bins is modified.
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins +  10 * 4);
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+    m_vec_mul_add_x_inv_64(bins +  10 * 4, bins +  7 * 4);
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4);
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins +  14 * 4);
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4);
+    m_vec_mul_add_x_inv_64(bins +  14 * 4, bins +  15 * 4);
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4);
+    m_vec_mul_add_x_inv_64(bins +  15 * 4, bins +  13 * 4);
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4);
+    m_vec_mul_add_x_inv_64(bins +  13 * 4, bins +  9 * 4);
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4);
+    m_vec_mul_add_x_inv_64(bins +   9 * 4, bins +  1 * 4);
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4);
+    vec_copy_64(bins + 4, out); // bins + 4 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) { // out = in for one 96-element vector (6 words, 16 four-bit elements per word)
+    out[0] = in[0];
+    out[1] = in[1];
+    out[2] = in[2];
+    out[3] = in[3];
+    out[4] = in[4];
+    out[5] = in[5];
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) { // acc += in over 6 words; addition in this field is XOR
+    acc[0] ^= in[0];
+    acc[1] ^= in[1];
+    acc[2] ^= in[2];
+    acc[3] ^= in[3];
+    acc[4] ^= in[4];
+    acc[5] ^= in[5];
+}
+
+inline
+static void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) { // acc += x * in, element-wise over 6 words
+    for(int i=0;i<6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2); // 0x2 encodes the field element x
+    }
+}
+
+inline
+static void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) { // acc += x^-1 * in, element-wise over 6 words
+    for(int i=0;i<6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x9); // 0x9 = x^3 + 1, the inverse of x: x * (x^3 + 1) = x^4 + x = 1 mod x^4 + x + 1
+    }
+}
+
+static 
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) { // acc += a * in for a single field element a, over 6 words
+    for(int i=0; i < 6;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);        
+    }
+}
+
+static 
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // bins holds 16 vectors (bin i at bins + i * 6); the chain below folds them in place into bin 1, which works out to the sum over i of i * bins[i]; bins is modified.
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins +  10 * 6);
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+    m_vec_mul_add_x_inv_96(bins +  10 * 6, bins +  7 * 6);
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6);
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins +  14 * 6);
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6);
+    m_vec_mul_add_x_inv_96(bins +  14 * 6, bins +  15 * 6);
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6);
+    m_vec_mul_add_x_inv_96(bins +  15 * 6, bins +  13 * 6);
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6);
+    m_vec_mul_add_x_inv_96(bins +  13 * 6, bins +  9 * 6);
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6);
+    m_vec_mul_add_x_inv_96(bins +   9 * 6, bins +  1 * 6);
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6);
+    vec_copy_96(bins + 6, out); // bins + 6 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#define K_OVER_2 ((K_MAX+1)/2)
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { // four 32-byte rows: multiplication by 1, 2, 4 and 8, each 16-entry table stored twice
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+// The result XORs together the __gf16_mulbase rows selected by the set bits of b, giving the table i -> b * i.
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+
+    __m256i bx = _mm256_set1_epi16( b & 0xf );
+    __m256i b1 = _mm256_srli_epi16( bx, 1 ); // b >> 1, so the masks 1 and 4 below test bits 1 and 3 of b
+
+    const __m256i tab0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+    const __m256i tab1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i tab2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i tab3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+    __m256i mask_1  = _mm256_set1_epi16(1);
+    __m256i mask_4  = _mm256_set1_epi16(4);
+    __m256i mask_0  = _mm256_setzero_si256();
+
+    return ( tab0 & _mm256_cmpgt_epi16( bx & mask_1, mask_0) )
+           ^ ( tab1 & _mm256_cmpgt_epi16( b1 & mask_1, mask_0) )
+           ^ ( tab2 & _mm256_cmpgt_epi16( bx & mask_4, mask_0) )
+           ^ ( tab3 & _mm256_cmpgt_epi16( b1 & mask_4, mask_0) );
+}
+
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { // per byte of v: look up the low nibble in tab_l and the high nibble in tab_h, then XOR the two results
+    return _mm256_shuffle_epi8(tab_l, v & mask_f)^_mm256_shuffle_epi8(tab_h, _mm256_srli_epi16(v, 4)&mask_f);
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) { // multiplies both nibbles of every byte of a by the field element b
+    __m256i multab_l = tbl32_gf16_multab2( b );
+    __m256i multab_h = _mm256_slli_epi16( multab_l, 4 ); // same table with each product moved into the high nibble
+
+    return linear_transform_8x8_256b( multab_l, multab_h, a, _mm256_set1_epi8(0xf) );
+}
+
+static 
+inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    // build multiplication tables: one __m256i per pair of adjacent columns (c, c+1) in each row of O; column c fills the low nibbles, column c+1 the high nibbles
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        for (size_t c = 0; c < O_MAX; c+=2)
+        {
+            O_multabs[O_MAX/2*r + c/2] = tbl32_gf16_multab2(O[O_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(O[O_MAX*r + c + 1]), 4);
+        }
+    }
+}
+
+
+static 
+inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    // build multiplication tables: for each column c of V, rows r and r+1 share one __m256i (low / high nibbles); with an odd K_MAX the last row gets a table of its own
+    size_t r;
+    for (size_t c = 0; c < V_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            V_multabs[K_OVER_2*c +  r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(r+1) + c]), 4);
+        }
+#if K_MAX % 2 == 1
+        V_multabs[K_OVER_2*c + r/2] = tbl32_gf16_multab2(V[V_MAX*r + c]); // r == K_MAX - 1 here: the unpaired last row
+#endif
+    }
+}
+
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { // 16 rows of 32 bytes: row b (at offset 32*b) lists b * i for i = 0..15, stored twice
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, 
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, 
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, 
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, 
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, 
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, 
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, 
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, 
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, 
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, 
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, 
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, 
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, 
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, 
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+static 
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) { // pairs rows r and r+1 of each column like mayo_V_multabs_avx2, but loads ready-made rows of mayo_gf16_mul
+    size_t r;
+    for (size_t c = 0; c < V_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            S1_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])) // load address depends on the value of the S1 entry
+                                          ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S1_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*r + c])); // r == K_MAX - 1 here: the unpaired last row
+#endif
+    }
+}
+
+static 
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    // build multiplication tables: for each of the O_MAX columns of S2, rows r and r+1 share one __m256i, loaded from mayo_gf16_mul
+    size_t r;
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        for (r = 0; r+1 < K_MAX; r+= 2)
+        {
+            S2_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) // load address depends on the value of the S2 entry
+                                          ^ _mm256_slli_epi16(_mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(r+1) + c])), 4);
+        }
+#if K_MAX % 2 == 1
+        S2_multabs[K_OVER_2*c +  r/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*r + c])) ; // r == K_MAX - 1 here: the unpaired last row
+#endif
+    }
+}
+
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { // multiplies each of the 16 nibbles of a by b (low 4 bits), shift-and-add over the bits of b
+    uint64_t mask_msb = 0x8888888888888888ULL;
+    uint64_t a_msb;
+    uint64_t a64 = a;
+    uint64_t b32 = b;
+    uint64_t r64 = a64 * (b32 & 1); // bit 0 of b: contributes a itself
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); // a64 = a * x: the shifted-out x^4 term is replaced by x + 1 (binary 0011)
+    r64 ^= (a64) * ((b32 >> 1) & 1);
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); // a64 = a * x^2
+    r64 ^= (a64) * ((b32 >> 2) & 1);
+
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3); // a64 = a * x^3
+    r64 ^= (a64) * ((b32 >> 3) & 1);
+
+    return r64;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <immintrin.h>
+#include <stdint.h>
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// Generate the multiplication table for the 4-bit value 'b' (from https://eprint.iacr.org/2023/059/):
+// the four 32-byte rows of __gf16_mulbase are XORed together, row i being kept iff bit i of 'b' is set.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+    // b in every 16-bit lane, plus b >> 1 so that bits 1 and 3 can be tested with the same two masks
+    const __m256i b_lanes   = _mm256_set1_epi16( b & 0xf );
+    const __m256i b_shifted = _mm256_srli_epi16( b_lanes, 1 );
+    // all-ones selectors for the bits of b that are set, all-zeros otherwise
+    const __m256i sel0 = _mm256_cmpgt_epi16( b_lanes   & bit0, zero );
+    const __m256i sel1 = _mm256_cmpgt_epi16( b_shifted & bit0, zero );
+    const __m256i sel2 = _mm256_cmpgt_epi16( b_lanes   & bit2, zero );
+    const __m256i sel3 = _mm256_cmpgt_epi16( b_shifted & bit2, zero );
+    return ( sel0 & _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)) )
+           ^ ( sel1 & _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)) )
+           ^ ( sel2 & _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)) )
+           ^ ( sel3 & _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)) );
+}
+
+/* Put the M_MAX x (K_MAX*O_MAX + 1) byte matrix A in row echelon form with ones on the first nonzero
+ * entries, in constant time. The _nrows/_ncols arguments are ignored: the compile-time maxima are used. */
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+    (void) _nrows;
+    (void) _ncols;
+    #define nrows M_MAX
+    #define ncols (K_MAX * O_MAX + 1)
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW];
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A right-aligned into the padded AVX2 rows; the leading padding bytes are zeroed so that
+    // the vector operations of the elimination never read indeterminate stack contents
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < MAX_COLS - ncols; j++) {
+            A_bytes[i*MAX_COLS + j] = 0;
+        }
+        for (int j = 0; j < ncols; j++) {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+        }
+    }
+
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+    for (; pivot_col < MAX_COLS-128; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back (only the ncols real columns; the padding is dropped)
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+        }
+    }
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+/* Body of each pivot_col loop in EF(): one constant-time elimination step for column pivot_col, using EF()'s locals and macros. */
+int pivot_col_rounded = pivot_col/32; /* index of the 32-byte register that contains pivot_col; the loops below start there */
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1; /* all-ones until a nonzero pivot has been accumulated */
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); /* presumably all-ones iff row == pivot_row -- confirm against ct_compare_32 */
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row); /* presumably all-ones iff row > pivot_row -- confirm */
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); /* take the pivot row itself, or a later row while the pivot is still zero */
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    pivot = pivot_row_bytes[pivot_col]; /* candidate pivot: entry of the accumulated row in this column */
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+    _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); /* byte-wise table lookup: each entry is replaced by its product with inverse */
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    unsigned char below_pivot =  (unsigned char) (ct_is_greater_than(row, pivot_row));
+    unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+    __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); /* table for multiplying by the entry to cancel; uses 0 for rows that are not below the pivot row */
+    if (row <= pivot_row_upper_bound) { /* branch depends only on loop indices and bounds, not on matrix contents */
+        __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero); /* set only for the pivot row, and only when a nonzero pivot exists */
+        for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { 
+            A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+                                                    _mm256_shuffle_epi8(multab, _pivot_row[col]);
+        }
+    } else {
+        for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+            A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+        }
+    }
+}
+
+pivot_row += (-(int32_t)(~pivot_is_zero)); /* +1 when pivot_is_zero == 0 (nonzero pivot found), +0 when it is all-ones */
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+// Unpack mdeclen 4-bit values from m into mdec, one value per output byte, low nibble first.
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+    for (int idx = 0; idx < full_bytes; ++idx) {
+        mdec[2 * idx]     = m[idx] & 0xf;
+        mdec[2 * idx + 1] = m[idx] >> 4;
+    }
+    // an odd count takes one more value from the low nibble of the next input byte
+    if (mdeclen % 2 == 1) {
+        mdec[mdeclen - 1] = m[full_bytes] & 0x0f;
+    }
+}
+// Pack mlen values from m (one per byte) two to a byte into menc, first value in the low nibble.
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int pairs = mlen / 2;
+    for (int idx = 0; idx < pairs; ++idx) {
+        menc[idx] = m[2 * idx] | (m[2 * idx + 1] << 4);
+    }
+    // an odd count stores the last value alone in one more output byte
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[mlen - 1];
+    }
+}
+// Fold the k*k vectors of m nibbles in _vPv into one vector reduced mod f(X), then write y[i] = t[i] ^ (nibble i of the result).
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0}; // accumulator, 16 nibbles per 64-bit limb
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    int k = 0; // limb cursor; temp is all-zero on the first pass, so the "top" nibble read through it is 0
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // nibble shifted out of the limb k points at
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60; // carry the top nibble of limb k into limb k+1
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){ // even coefficients go to the low nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else { // odd coefficients go to the high nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add: entry (i,j), plus entry (j,i) when off the diagonal
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--; // k == PARAM_m(p)/16 after the loop; step back onto the top limb for the next pass
+        }
+    }
+
+    // add to y (two nibbles per byte of temp; t and y may be the same buffer, each index is read before it is written)
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+// Delta swap: exchange the bits of *upper selected by (mask << shift) with the bits of
+// *lower selected by mask. mask and (mask << shift) are expected not to overlap, as is the
+// case for every call below.
+static inline void delta_swap_u64(uint64_t *upper, uint64_t *lower, unsigned shift, uint64_t mask) {
+    const uint64_t delta = ((*upper >> shift) ^ *lower) & mask;
+    *upper ^= delta << shift;
+    *lower ^= delta;
+}
+
+// Transpose, in place, the 16x16 matrix of nibbles held in M[0..15]: one row per 64-bit
+// word, with column j stored in bits 4j..4j+3 of the word.
+// The transpose is done in four stages, each swapping the two off-diagonal quadrants of
+// every aligned block: 2x2 blocks of nibbles first, then 4x4, 8x8 and finally the whole
+// 16x16 matrix. Within a stage the swapped row pairs are disjoint.
+static void transpose_16x16_nibbles(uint64_t *M){
+    const uint64_t nibble_mask = 0x0f0f0f0f0f0f0f0f;
+    const uint64_t byte_mask   = 0x00ff00ff00ff00ff;
+    const uint64_t word16_mask = 0x0000ffff0000ffff;
+    const uint64_t word32_mask = 0x00000000ffffffff;
+
+    // 2x2 blocks: rows r and r+1
+    for (size_t r = 0; r < 16; r += 2) {
+        delta_swap_u64(&M[r], &M[r+1], 4, nibble_mask);
+    }
+
+    // 4x4 blocks: rows r and r+2
+    for (size_t r = 0; r < 16; r += 4) {
+        delta_swap_u64(&M[r],   &M[r+2], 8, byte_mask);
+        delta_swap_u64(&M[r+1], &M[r+3], 8, byte_mask);
+    }
+
+    // 8x8 blocks: rows r and r+4
+    for (size_t r = 0; r < 4; r++) {
+        delta_swap_u64(&M[r],   &M[r+4],  16, word16_mask);
+        delta_swap_u64(&M[r+8], &M[r+12], 16, word16_mask);
+    }
+
+    // 16x16 block: rows r and r+8
+    for (size_t r = 0; r < 8; r++) {
+        delta_swap_u64(&M[r], &M[r+8], 32, word32_mask);
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+// Build the byte matrix A_out (PARAM_m rows, PARAM_A_cols columns, one nibble value per byte) from the packed _VtL blocks, reducing mod f(X).
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0;  // sub-limb part of the current shift, a multiple of 4 in [0, 64)
+    int words_to_shift = 0; // whole-limb part of the current shift
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){ // guard: a shift by 64 - 0 == 64 would be undefined
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) { // off-diagonal pair: also add M_i into the column block of j
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4; // each (i, j) pair is shifted one nibble further than the previous one
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    // transpose every 16-word group of A, covering the m rows plus the k*(k+1)/2 rows of shift overflow
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    unsigned char tab[F_TAIL_LEN*4] = {0}; // tab[4*i + b] = f_tail[i] * x^b, used to reduce one bit plane at a time
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    // fold the rows at index m and above back into the first rows using the tail of f
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // bit 0 of every nibble in this word
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; // bit 1
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; // bit 2
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; // bit 3
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+    // NOTE(review): the local A is not wiped before returning although the signer treats its copy (A_out) as secret -- confirm whether a clear is wanted.
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); // the last column of A_out is not written here
+            }   
+        }
+    }
+}
+
+// Public API
+
+/*
+ * Generate a MAYO key pair.
+ *
+ * Thin wrapper: the work is done by mayo_keypair_compact(), whose status code
+ * (MAYO_OK on success) is passed through unchanged. pk and sk receive the
+ * compact public and secret keys written by that function.
+ */
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    const int status = mayo_keypair_compact(p, pk, sk);
+    return status;
+}
+// Sign m (mlen bytes) with compact secret key csk: writes PARAM_sig_bytes bytes (encoded s, then the salt) to sig and sets *siglen.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    // unsigned char Mdigest[DIGEST_BYTES];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1]; // digest || salt || seed_sk || counter byte
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk; // the compact secret key starts with the secret seed
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8); // L follows P1 inside sk.p
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes); // the randomizer in tmp is replaced by the derived salt
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) { // up to 256 attempts, one per value of the counter byte
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); // reset the accumulator before the next attempt
+        }
+    }
+    // NOTE(review): if all 256 attempts fail, execution still falls through and MAYO_OK is returned -- confirm this is intended.
+    // every byte of s[0 .. param_n*param_k) is written below: first n-o entries by mat_add, last o by memcpy
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); // the salt occupies the last bytes of the signature
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+// Produce the signed message sm = signature || m. m and sm may overlap (hence memmove): the message is
+// moved to its final place first and signed from there, so the signature write can no longer clobber it.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    memmove(sm + param_sig_bytes, m, mlen);
+    int ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) {
+        // never report success without having set *smlen
+        return (ret != MAYO_OK) ? ret : MAYO_ERR;
+    }
+    *smlen = siglen + mlen;
+    return MAYO_OK;
+}
+// Verify the signed message sm = signature || message; on success copy the message to m and set *mlen.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const size_t sig_len = (size_t) PARAM_sig_bytes(p);
+    if (smlen < sig_len) {
+        return MAYO_ERR;
+    }
+    const unsigned char *msg = sm + sig_len;
+    const size_t msg_len = smlen - sig_len;
+    const int status = mayo_verify(p, msg, msg_len, sm, pk);
+    if (status != MAYO_OK) {
+        return status;
+    }
+    *mlen = msg_len;
+    memmove(m, msg, msg_len);
+    return status;
+}
+// Generate a compact key pair: csk receives the random secret seed, cpk receives seed_pk followed by Upper(P3).
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+    // wipe secret material: S holds the encoded O; P presumably holds P1*O + P2 in place of P2 after Ot_times_P1O_P2
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P, sizeof(P));
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+// Expand the compact public key cpk (seed || P3) into pk laid out as P1 || P2 || P3. Always returns MAYO_OK.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1p2_len = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+
+    // P1 and P2 are regenerated from the seed at the start of cpk
+    PK_PRF(pk, p1p2_len, cpk, seed_len);
+    // P3 is copied verbatim from behind the seed
+    memmove(pk + p1p2_len, cpk + seed_len, PARAM_P3_bytes(p));
+    return MAYO_OK;
+}
+// Expand the compact secret key csk (the seed) into sk: sk->p gets P1 followed by L (written over P2), sk->o the encoded O. Returns MAYO_OK.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk followed by the encoded O
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    unsigned char *seed_pk = S;
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2; // L aliases P2: the result is accumulated in place of P2 inside sk->p
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+// Verify signature sig over m (mlen bytes) with compact public key cpk: MAYO_OK for a good signature, MAYO_ERR otherwise.
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX]; // digest || salt
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk; // expanded key layout: P1, then P2, then P3
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t (the salt is taken from the last bytes of the signature)
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X); y is all-zero on entry and is passed as both t and y
+    compute_rhs(p, SPS, y, y);
+    
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5 // number of entries in each F_TAIL_* initializer below
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1" // the "=" notes in this set are arithmetic identities of the listed values; they also hold for MAYO_2/3/5 below
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o) // 58
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) // 73; MAYO_1_k is defined on the next line, which works because macros expand at the point of use
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32 // = m / 2
+#define MAYO_1_O_bytes 232 // = v * o / 2
+#define MAYO_1_v_bytes 29 // = ceil(v / 2)
+#define MAYO_1_r_bytes 36 // = ceil(k * o / 2)
+#define MAYO_1_P1_bytes 54752 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_1_P2_bytes 14848 // = v * o * m_bytes
+#define MAYO_1_P3_bytes 1152 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_1_csk_bytes 24 // = sk_seed_bytes
+#define MAYO_1_esk_bytes 69856 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_1_cpk_bytes 1168 // = pk_seed_bytes + P3_bytes
+#define MAYO_1_epk_bytes 70752 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_1_sig_bytes 321 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o) // 60
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) // 73
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o) // 89
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) // 111
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o) // 121
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) // 145
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b // pastes a_b; the PARAM_JOIN2 wrapper below lets a and b be macro-expanded before pasting
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) // e.g. MAYO_VARIANT=MAYO_1, end=m -> MAYO_1_m
+
+#if defined(MAYO_VARIANT) // fixed-variant build: exported symbols get a per-variant, per-build-type prefix
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s) // -> pqmayo_<variant>_<end>_<s>
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else // no MAYO_VARIANT: names are left unprefixed
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // worst case over all four sets: MAYO_5 values, except O_MAX, P3_BYTES_MAX and CPK_BYTES_MAX which come from MAYO_2
+#define NAME_MAX mayo5 // NOTE(review): NAME_MAX is also a POSIX <limits.h> macro name - confirm there is no clash in dynamic builds
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // fixed-variant build: every bound is exactly the selected set's value
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // taken from the set's own sk_seed_bytes instead of aliasing SALT_BYTES_MAX (same value for all four sets today)
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // dynamic build: read each value from the mayo_params_t that p points to; p is parenthesized so &set or ptr + i work as arguments
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // fixed-variant build: p is ignored and each accessor expands to the selected set's constant
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail); // one private copy per translation unit that includes this header
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters; its fields are read through the PARAM_*(p) accessors when ENABLE_PARAMS_DYNAMIC is defined.
+ */
+typedef struct {
+    int m; // <set>_m
+    int n; // <set>_n
+    int o; // <set>_o; PARAM_v(p) is derived as n - o
+    int k; // <set>_k; PARAM_A_cols(p) is derived as k * o + 1
+    int q; // <set>_q (16 in every set defined above)
+    const unsigned char *f_tail; // points at one of the f_tail_64/96/128 arrays (F_TAIL_LEN entries each)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // no PARAM_R_bytes accessor exists and MAYO_GEN_PARAMS gives it no initializer
+    int P1_bytes; // byte sizes of the P1, P2 and P3 parts of an expanded public key
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes; // compact secret key size
+    int esk_bytes; // expanded secret key size
+    int cpk_bytes; // compact public key size
+    int epk_bytes; // expanded public key size
+    int sig_bytes; // signature size
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // e.g. "MAYO_1"
+} mayo_params_t;
+
+typedef struct sk_t { // output type of mayo_expand_sk()
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // P1_BYTES_MAX/8 + P2_BYTES_MAX/8 64-bit words
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX bytes; NOTE(review): presumably the encoded O matrix - confirm against mayo_expand_sk
+} sk_t;
+
+/**
+ * MAYO parameter sets; declared only for ENABLE_PARAMS_DYNAMIC builds and defined in params.c through MAYO_GEN_PARAMS.
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes returned by the mayo_* functions declared below.
+ */
+#define MAYO_OK 0 // success (e.g. signature accepted)
+#define MAYO_ERR 1 // failure (e.g. signature rejected)
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) // NOTE(review): no doc block; presumably writes only the signature to sig and its length to *siglen - confirm against the implementation
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ * 
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code (callers compare it against MAYO_OK)
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for providing the sk_t object that esk points to.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked (read-only; nothing is written to it)
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 (MAYO_OK) if verification succeeded, 1 (MAYO_ERR) otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // byte-order reversal via compiler builtins
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // shift-and-mask fallback; the argument i is evaluated several times, so it must be free of side effects
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) // relies on i being 64 bits wide so that the << 32 is in range
+#endif
+
+// a > b -> b - a is negative; the subtraction is done in 64 bits so it cannot overflow for any pair of ints
+// returns 0xFFFFFFFF if true, 0x00000000 if false (no branches, no data-dependent memory access)
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    const uint64_t diff = (uint64_t) (((int64_t) b) - ((int64_t) a)); // top bit is set iff a > b
+    return (uint32_t) 0 - (uint32_t) (diff >> (8*sizeof(uint64_t)-1)); // 0 - 1 wraps to all ones
+}
+
+// Branch-free test for a > b, evaluated on operands widened to 64 bits.
+// Yields 0xFFFFFFFFFFFFFFFF when a > b and 0 otherwise.
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t delta = (int64_t) b - (int64_t) a; // negative exactly when a > b
+    return (uint64_t) 0 - (((uint64_t) delta) >> 63); // spread the sign bit over the whole word
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF; branch-free and computed on unsigned words, so a negative a ^ b can no longer be reported as "equal"
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t) 0 - ((((uint32_t) a ^ (uint32_t) b) | ((uint32_t) 0 - ((uint32_t) a ^ (uint32_t) b))) >> (8*sizeof(uint32_t)-1)); // (x | -x) has its top bit set iff x != 0
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF; branch-free and computed on unsigned words, so a negative a ^ b can no longer be reported as "equal"
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t) 0 - (uint64_t) ((((uint32_t) a ^ (uint32_t) b) | ((uint32_t) 0 - ((uint32_t) a ^ (uint32_t) b))) >> (8*sizeof(uint32_t)-1)); // (x | -x) has its top bit set iff x != 0
+}
+
+// Branch-free byte inequality mask: 0x00 when a == b, 0xFF otherwise.
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) (0u - ((((unsigned) (a ^ b)) + 0xFFu) >> 8)); // (x + 255) >> 8 is 1 for x in 1..255 and 0 for x == 0
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding both arguments to OQS_MEM_secure_free().
+ * 
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding both arguments to OQS_MEM_cleanse(); nothing is freed.
+ * 
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/params.c b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+// Defines the parameter-set object `nm` from the nm_* constants in mayo.h; the invocation supplies the terminating ';'.
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+// The macro no longer ends in ';', so each line below is one declaration with no stray file-scope ';'. R_bytes is left zero-initialized.
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v, O: v x o. The product is XORed into _acc (2*O_MAX vectors per row r), so the caller supplies the initial contents.
+static 
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // _P1 is read as consecutive pairs of 256-bit vectors, one pair per (r, c) entry with c >= r.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // dereferenced directly below; NOTE(review): this presumably needs _acc to be 32-byte aligned - confirm at the call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts the 256-bit vectors consumed from P1 so far (it advances by two per entry).
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            cols_used ++;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // indices 2*k .. 2*k+3 stay inside temp only when O_MAX is even
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); 
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k]     ^ _mm256_slli_epi16(t0,4); // output column k, first and second vector
+            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0; // output column k+1, first and second vector
+            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+static 
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){ // XORs the result into _acc; every matrix entry is a pair of 256-bit vectors
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of the input (all rows r) and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c); // entry (r, c) of the V_MAX x O_MAX input, first vector
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1); // entry (r, c), second vector
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // indices 2*k .. 2*k+3 stay inside temp only when O_MAX is even
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]  ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4); // output entry (k, c)
+            acc[2*(k*O_MAX) + 2*c]      ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c]     ^= temp[2*k+1] ^ t0; // output entry (k+1, c)
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // For each row r, combines the packed P1 entries (c, r) with c < r and (r, c) with c > r; the diagonal entry is skipped. The result is XORed into _acc.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // The lookup tables are built locally from O by mayo_O_multabs_avx2 (declared outside this header).
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    // cols_used counts entries (vector pairs) of the packed upper-triangular P1 that have been passed so far.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        cols_used += 1; // step over the diagonal entry (r, r)
+        size_t pos = r; // index of entry (0, r); each later row c is V_MAX - c - 1 entries further on
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1);
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // entries (r, c) with c > r are stored consecutively, right after the diagonal entry
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // convert to normal format and add to accumulator (unaligned loads/stores are used for acc here)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+            __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+            __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k),     acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+        }
+    }
+}
+
+
+static 
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // Reads L as a V_MAX x O_MAX matrix of vector pairs and XORs the result into _acc (K_MAX rows of O_MAX pairs).
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared here because the odd-K_MAX tail below reuses its value after the conversion loop
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of L (all rows r) and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; NOTE(review): presumably ceil(K_MAX / 2) - confirm
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2) // k advances by 2 here, so temp[2*k ..] addresses the same slots as temp[4*j ..] above
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]     ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // compiled only when K_MAX is odd: the last output row k has no partner row k+1, so only row k is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+static 
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){ // XORs the result into _acc (V_MAX rows of K_MAX vector pairs)
+    size_t k,c; // k is reused after the conversion loop by the odd-K_MAX tail
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts entries (vector pairs) consumed from the packed upper-triangular P1, where only columns c >= r are stored.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; NOTE(review): presumably ceil(K_MAX / 2) - confirm
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k + 1 < K_MAX; k+=2) // k advances by 2 here, so temp[2*k ..] addresses the same slots as temp[4*j ..] above
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // compiled only when K_MAX is odd: the last output column k has no partner column k+1, so only column k is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+static 
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // Reads Pv as a V_MAX x K_MAX matrix of vector pairs and XORs the result into _acc (K_MAX rows of K_MAX pairs).
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared here because the odd-K_MAX tail below reuses its value after the conversion loop
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of Pv (all rows r) and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; NOTE(review): presumably ceil(K_MAX / 2) - confirm
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high 4 bits of every byte
+            in_odd0 &= low_nibble_mask; // low 4 bits of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2) // k advances by 2 here, so temp[2*k ..] addresses the same slots as temp[4*j ..] above
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // compiled only when K_MAX is odd: the last output row k has no partner row k+1, so only row k is written
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, packed row by row), S1: v x k, P2: v x o, S2: o x k
+// Row r of the v x k result is XOR-accumulated into acc; every P1/P2/acc entry is two consecutive __m256i (hence the 2* in all indices).
+static 
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1;
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): acc[] is dereferenced directly below, which presumably needs a 32-byte aligned _acc - confirm at the call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+
+    size_t P1_cols_used = 0; // running entry index into the packed triangle of P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // shared by both products, so their sum is converted only once per row
+
+        // temp layout per table index j: [4j] word 0 low nibbles, [4j+1] word 0 high nibbles, [4j+2] word 1 low nibbles, [4j+3] word 1 high nibbles
+        // P1 * S1
+        for (c=r; c < V_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used); // first word of P1[r][c]
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1); // second word of P1[r][c]
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 * S2
+        for (c=0; c < O_MAX; c++) // P2 is dense: entry (r, c) sits at r*O_MAX + c
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: XORing with t / t<<4 swaps the high nibbles of the first vector with the low nibbles of the second
+        for (k = 0; k + 1 < K_MAX; k+=2) // here k counts output columns; temp[2*k ..] is the group filled for table index k/2
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); // column k, word 0
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; // column k+1, word 0
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); // column k, word 1
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; // column k+1, word 1
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX-1 here, the last column has no partner
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// Row r of the o x k result is XOR-accumulated into acc; every P3/acc entry is two consecutive __m256i (hence the 2* in all indices).
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static 
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P3 = (__m256i *) _P3;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): acc[] is dereferenced directly below, which presumably needs a 32-byte aligned _acc - confirm at the call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+
+    size_t cols_used = 0; // running entry index into the packed triangle of P3
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // per table index j: [4j] word 0 low nibbles, [4j+1] word 0 high nibbles, [4j+2] word 1 low nibbles, [4j+3] word 1 high nibbles
+
+        for (c=r; c < O_MAX; c++) // only columns c >= r are stored for row r
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used); // first word of P3[r][c]
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1); // second word of P3[r][c]
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: XORing with t / t<<4 swaps the high nibbles of the first vector with the low nibbles of the second
+        for (k = 0; k + 1 < K_MAX; k+=2) // here k counts output columns; temp[2*k ..] is the group filled for table index k/2
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); // column k, word 0
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0; // column k+1, word 0
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); // column k, word 1
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1; // column k+1, word 1
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX-1 here, the last column has no partner
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// S1t*PS1: thin alias that forwards its arguments unchanged to mayo_5_Vt_times_Pv_avx2, with S1_multabs in the role of V_multabs.
+static
+inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+// S2t*PS2: for each column c of PS2 (O_MAX x K_MAX) XOR-accumulates one column of a K_MAX x K_MAX result into acc; every PS2/acc entry is two consecutive __m256i.
+static
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): acc[] is dereferenced directly below, which presumably needs a 32-byte aligned _acc - confirm at the call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+    size_t k; // function scope because the K_MAX-odd tail reuses the value left behind by the conversion loop
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of PS2 and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // per table index j: [4j] word 0 low nibbles, [4j+1] word 0 high nibbles, [4j+2] word 1 low nibbles, [4j+3] word 1 high nibbles
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c); // first word of PS2[r][c]
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1); // second word of PS2[r][c]
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: XORing with t / t<<4 swaps the high nibbles of the first vector with the low nibbles of the second
+        for (k = 0; k+1 < K_MAX; k+=2) // here k counts output rows; temp[2*k ..] is the group filled for table index k/2
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4); // row k, word 0
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0; // row k+1, word 0
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4); // row k, word 1
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1; // row k+1, word 1
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX-1 here, the last row has no partner row
+        acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v (upper triangle, packed row by row), O: v x o
+// Row r of the v x o product is XOR-accumulated into acc[r*O_MAX .. r*O_MAX + O_MAX - 1];
+// every matrix entry occupies exactly one __m256i.
+static 
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    const __m256i *p1_vecs = (__m256i *) _P1;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    size_t next_entry = 0; // position inside the packed triangle of P1
+    for (size_t row = 0; row < V_MAX; row++) {
+        // table lookups for the whole row, kept in split-nibble layout for now
+        __m256i split[O_MAX] = {0};
+
+        for (size_t col = row; col < V_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p1_vecs + next_entry);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = O_multabs + O_MAX/2*col;
+            next_entry++;
+
+            for (size_t j = 0; j < O_MAX; j += 2) {
+                split[j]     = _mm256_xor_si256(split[j],     _mm256_shuffle_epi8(tabs[j/2], lo_nibs));
+                split[j + 1] = _mm256_xor_si256(split[j + 1], _mm256_shuffle_epi8(tabs[j/2], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into the output row
+        __m256i *out_row = out + row*O_MAX;
+        for (size_t j = 0; j < O_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            out_row[j]     = _mm256_xor_si256(out_row[j],     _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            out_row[j + 1] = _mm256_xor_si256(out_row[j + 1], _mm256_xor_si256(split[j + 1], swap));
+        }
+    }
+}
+
+
+// Ot * P1O_P2 -> input: v x o, result: o x o (the name suggests the input is P1*O + P2)
+// Column c of the result is XOR-accumulated into acc[j*O_MAX + c]; one __m256i per entry.
+static 
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    const __m256i *src = (__m256i *) _P1O_P2;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    for (size_t col = 0; col < O_MAX; col++) {
+        // table lookups for this input column, kept in split-nibble layout for now
+        __m256i split[O_MAX] = {0};
+        for (size_t row = 0; row < V_MAX; row++) {
+            const __m256i packed = _mm256_loadu_si256(src + row*O_MAX + col);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = O_multabs + O_MAX/2*row;
+
+            for (size_t j = 0; j < O_MAX; j += 2) {
+                split[j]     = _mm256_xor_si256(split[j],     _mm256_shuffle_epi8(tabs[j/2], lo_nibs));
+                split[j + 1] = _mm256_xor_si256(split[j + 1], _mm256_shuffle_epi8(tabs[j/2], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into column `col` of the output
+        for (size_t j = 0; j < O_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            __m256i *even_dst = out + j*O_MAX + col;
+            __m256i *odd_dst  = out + (j + 1)*O_MAX + col;
+            *even_dst = _mm256_xor_si256(*even_dst, _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            *odd_dst  = _mm256_xor_si256(*odd_dst,  _mm256_xor_si256(split[j + 1], swap));
+        }
+    }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // (P1 + P1^T)*O with the diagonal of P1 left out: row r of the v x o result is XORed into acc[r*O_MAX ..]; one __m256i per entry.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // only accessed through loadu/storeu below, so no 32-byte alignment is assumed
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+    // lookup tables are built from O by mayo_O_multabs_avx2 (defined elsewhere); they are indexed as O_MAX/2 tables per row of O
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0; // running index into the packed upper triangle of P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[O_MAX] = {0}; // temp[k]: lookups on low nibbles, temp[k+1]: lookups on high nibbles (k even)
+        cols_used += 1; // skip the diagonal entry P1[r][r]
+        size_t pos = r; // packed index of P1[0][r]
+        for (size_t c = 0; c < r; c++) // transposed part: walks P1[c][r] for c < r, i.e. column r above the diagonal
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+            pos += (V_MAX -c - 1); // distance from P1[c][r] to P1[c+1][r] in the packed layout
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++) // direct part: P1[r][c] for c > r, read sequentially
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+        // convert to normal format and add to accumulator: XORing with t / t<<4 swaps the high nibbles of temp[k] with the low nibbles of temp[k+1]
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256(acc + (r*O_MAX + k    ), acc0 ^ temp[k  ] ^ _mm256_slli_epi16(t,4));
+            _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+        }
+    }
+}
+
+
+// Vt * L -> L: v x o, result: k x o (V enters only through its lookup tables V_multabs)
+// Column c of the result is XOR-accumulated into acc[j*O_MAX + c]; one __m256i per entry.
+static 
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    const __m256i *src = (__m256i *) _L;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+    size_t j; // function scope: the odd-K_MAX tail reuses the value the merge loop leaves behind
+
+    for (size_t col = 0; col < O_MAX; col++) {
+        // table lookups for this input column, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+        for (size_t row = 0; row < V_MAX; row++) {
+            const __m256i packed = _mm256_loadu_si256(src + row*O_MAX + col);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = V_multabs + K_OVER_2*row;
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into column `col` of the output
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            __m256i *even_dst = out + j*O_MAX + col;
+            __m256i *odd_dst  = out + (j + 1)*O_MAX + col;
+            *even_dst = _mm256_xor_si256(*even_dst, _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            *odd_dst  = _mm256_xor_si256(*odd_dst,  _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output row has no partner, so only that row is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out[j*O_MAX + col] = _mm256_xor_si256(out[j*O_MAX + col], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+
+// Vt * Pv -> Pv: v x k, result: k x k (V enters only through its lookup tables V_multabs)
+// Column c of the result is XOR-accumulated into acc[j*K_MAX + c]; one __m256i per entry.
+static 
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    const __m256i *src = (__m256i *) _Pv;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+    size_t j; // function scope: the odd-K_MAX tail reuses the value the merge loop leaves behind
+
+    for (size_t col = 0; col < K_MAX; col++) {
+        // table lookups for this input column, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+        for (size_t row = 0; row < V_MAX; row++) {
+            const __m256i packed = _mm256_loadu_si256(src + row*K_MAX + col);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = V_multabs + K_OVER_2*row;
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into column `col` of the output
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            __m256i *even_dst = out + j*K_MAX + col;
+            __m256i *odd_dst  = out + (j + 1)*K_MAX + col;
+            *even_dst = _mm256_xor_si256(*even_dst, _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            *odd_dst  = _mm256_xor_si256(*odd_dst,  _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output row has no partner, so only that row is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out[j*K_MAX + col] = _mm256_xor_si256(out[j*K_MAX + col], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+// P1 * Vt -> P1: v x v (upper triangle, packed row by row), result: v x k
+// Row r of the result is XOR-accumulated into acc[r*K_MAX ..]; one __m256i per entry.
+static 
+inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t j, col;
+    const __m256i *p1_vecs = (__m256i *) _P1;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    size_t next_entry = 0; // position inside the packed triangle of P1
+    for (size_t row = 0; row < V_MAX; row++) {
+        // table lookups for the whole row, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+
+        for (col = row; col < V_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p1_vecs + next_entry);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = V_multabs + K_OVER_2*col;
+            next_entry++;
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into the output row
+        __m256i *out_row = out + row*K_MAX;
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            out_row[j]     = _mm256_xor_si256(out_row[j],     _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            out_row[j + 1] = _mm256_xor_si256(out_row[j + 1], _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output column has no partner, so only that entry is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out_row[j] = _mm256_xor_si256(out_row[j], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
+// thin alias: forwards its arguments unchanged to mayo_12_P1_times_Vt_avx2 (S1_multabs takes the role of V_multabs)
+static
+inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+// S1t*PS1: thin alias that forwards its arguments unchanged to mayo_12_Vt_times_Pv_avx2, with S1_multabs in the role of V_multabs.
+static
+inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+// S2t * PS2 -> PS2: o x k, result: k x k; column c is XOR-accumulated into acc[j*K_MAX + c], one __m256i per entry.
+static
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *src = (__m256i *) _PS2;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+    size_t j; // function scope: the odd-K_MAX tail reuses the value the merge loop leaves behind
+
+    for (size_t col = 0; col < K_MAX; col++) {
+        // table lookups for this input column, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+        for (size_t row = 0; row < O_MAX; row++) {
+            const __m256i packed = _mm256_loadu_si256(src + row*K_MAX + col);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = S2_multabs + K_OVER_2*row;
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into column `col` of the output
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            __m256i *even_dst = out + j*K_MAX + col;
+            __m256i *odd_dst  = out + (j + 1)*K_MAX + col;
+            *even_dst = _mm256_xor_si256(*even_dst, _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            *odd_dst  = _mm256_xor_si256(*odd_dst,  _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output row has no partner, so only that row is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out[j*K_MAX + col] = _mm256_xor_si256(out[j*K_MAX + col], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+// Row r of the v x k result is XOR-accumulated into acc[r*K_MAX ..]; one __m256i per entry.
+static 
+inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+    size_t j, col;
+    const __m256i *p2_vecs = (__m256i *) _P2;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    size_t next_entry = 0; // P2 is consumed strictly in storage order, O_MAX entries per row
+    for (size_t row = 0; row < V_MAX; row++) {
+        // table lookups for the whole row, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+
+        for (col = 0; col < O_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p2_vecs + next_entry);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = S2_multabs + K_OVER_2*col;
+            next_entry++;
+
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into the output row
+        __m256i *out_row = out + row*K_MAX;
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            out_row[j]     = _mm256_xor_si256(out_row[j],     _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            out_row[j + 1] = _mm256_xor_si256(out_row[j + 1], _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output column has no partner, so only that entry is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out_row[j] = _mm256_xor_si256(out_row[j], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangle, packed row by row), S1: v x k, P2: v x o, S2: o x k
+// Row r of the v x k result is XOR-accumulated into acc[r*K_MAX ..];
+// every matrix entry occupies exactly one __m256i.
+static 
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t j, col;
+    const __m256i *p1_vecs = (__m256i *) _P1;
+    const __m256i *p2_vecs = (__m256i *) _P2;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    size_t p1_next = 0; // position inside the packed triangle of P1
+    for (size_t row = 0; row < V_MAX; row++) {
+        // both products feed the same split-nibble accumulators, so the merge below runs once per row
+        __m256i split[K_OVER_2*2] = {0};
+
+        // P1 * S1 (only columns col >= row exist in the packed triangle)
+        for (col = row; col < V_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p1_vecs + p1_next);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = S1_multabs + K_OVER_2*col;
+            p1_next++;
+
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // P2 * S2 (dense row of P2)
+        for (col = 0; col < O_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p2_vecs + row*O_MAX + col);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = S2_multabs + K_OVER_2*col;
+
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout)
+        // and XOR the result into the output row
+        __m256i *out_row = out + row*K_MAX;
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            out_row[j]     = _mm256_xor_si256(out_row[j],     _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            out_row[j + 1] = _mm256_xor_si256(out_row[j + 1], _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output column has no partner, so only that entry is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out_row[j] = _mm256_xor_si256(out_row[j], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular (packed row by row)
+// Row r of the o x k result is XOR-accumulated into acc[r*K_MAX ..]; one __m256i per entry.
+static 
+inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t j, col;
+    const __m256i *p3_vecs = (__m256i *) _P3;
+    __m256i *out = (__m256i *) _acc;
+    const __m256i nib_mask = _mm256_set1_epi8(0x0f);
+
+    size_t next_entry = 0; // position inside the packed triangle of P3
+    for (size_t row = 0; row < O_MAX; row++) {
+        // table lookups for the whole row, kept in split-nibble layout for now
+        __m256i split[K_OVER_2*2] = {0};
+
+        for (col = row; col < O_MAX; col++) {
+            const __m256i packed = _mm256_loadu_si256(p3_vecs + next_entry);
+            const __m256i lo_nibs = _mm256_and_si256(packed, nib_mask);
+            const __m256i hi_nibs = _mm256_and_si256(_mm256_srli_epi16(packed, 4), nib_mask);
+            const __m256i *tabs = S2_multabs + K_OVER_2*col;
+            next_entry++;
+
+            for (j = 0; j < K_OVER_2; j++) {
+                split[2*j]     = _mm256_xor_si256(split[2*j],     _mm256_shuffle_epi8(tabs[j], lo_nibs));
+                split[2*j + 1] = _mm256_xor_si256(split[2*j + 1], _mm256_shuffle_epi8(tabs[j], hi_nibs));
+            }
+        }
+
+        // exchange nibbles inside each pair (back to the packed layout) and XOR into the output row
+        __m256i *out_row = out + row*K_MAX;
+        for (j = 0; j + 1 < K_MAX; j += 2) {
+            const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+            out_row[j]     = _mm256_xor_si256(out_row[j],     _mm256_xor_si256(split[j],     _mm256_slli_epi16(swap, 4)));
+            out_row[j + 1] = _mm256_xor_si256(out_row[j + 1], _mm256_xor_si256(split[j + 1], swap));
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: j == K_MAX-1 here; the last output column has no partner, so only that entry is written
+        const __m256i swap = _mm256_and_si256(_mm256_xor_si256(split[j + 1], _mm256_srli_epi16(split[j], 4)), nib_mask);
+        out_row[j] = _mm256_xor_si256(out_row[j], _mm256_xor_si256(split[j], _mm256_slli_epi16(swap, 4)));
+#endif
+    }
+}
+
+// Folds a square matrix of m-vectors into packed upper-triangular form: out receives in[r][r] for the diagonal and in[r][c] ^ in[c][r] for every c > r, row by row.
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+    // `size` is the row stride of `in` (counted in entries); the loops cover the leading O_MAX x O_MAX part.
+    int m_vecs_stored = 0;
+    // Entries are m_legs*2 limbs apart but exactly one __m256i (4 limbs) is moved per entry - presumably m_legs == 2 for this parameter set; confirm. loadu/storeu are used because uint64_t pointers carry no 32-byte alignment guarantee.
+    for (int r = 0; r < O_MAX; ++r) {
+        const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+        __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+        _mm256_storeu_si256(_out, _mm256_loadu_si256(_in));
+        m_vecs_stored++;
+        for (int c = r + 1; c < O_MAX; ++c) {
+            const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+            const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+            _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+            _mm256_storeu_si256(_out, _mm256_xor_si256(_mm256_loadu_si256(_in2), _mm256_loadu_si256(_in3)));
+            m_vecs_stored++;
+        }
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// acc += P1*O -> P1: v x v (upper triangular, stored row by row), O: v x o
+static 
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+    // Every matrix entry spans 6 uint64_t limbs of packed nibbles and is handled as two overlapping 256-bit halves (limbs 0..3 and 2..5).
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // number of P1 entries consumed so far
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // NOTE(review): each O_multabs vector presumably packs the product tables for columns k and k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+    // acc += Ot*P1O_P2 -> P1O_P2: v x o, O: v x o (given as lookup tables), acc: o x o; every entry is 6 uint64_t limbs of packed nibbles
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column of P1O_P2 and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // NOTE(review): each O_multabs vector presumably packs the product tables for columns k and k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator rows k and k+1; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc += (P1 + P1t)*O -> P1: v x v upper triangular stored row by row (its diagonal is skipped), acc: v x o; every entry is 6 uint64_t limbs of packed nibbles
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs); // presumably fills O_multabs with the lookup tables derived from O - confirm against its definition
+
+    size_t cols_used = 0; // number of P1 entries consumed so far in row-major order
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        cols_used ++; // step over the diagonal entry (r,r)
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        size_t pos = r; // packed index of entry (0,r); walks down column r, i.e. along row r of the transpose
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // distance from entry (c,r) to entry (c+1,r) in the packed layout
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // entries right of the diagonal in row r, read sequentially
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+    // acc(k,c) += sum over r of table(r,k) * L(r,c) -> L: v x o, acc: k x o; every entry is 6 uint64_t limbs of packed nibbles, handled as two overlapping 256-bit halves
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t k;
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // NOTE(review): V_multabs[K_OVER_2*r + k] presumably packs the product tables for output rows 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator rows k and k+1; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover row is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+      const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc(r,k) += sum over c >= r of P1(r,c) * table(c,k) -> P1: v x v upper triangular stored row by row, acc: v x k; every entry is 6 uint64_t limbs of packed nibbles
+    size_t k,c;
+    size_t cols_used = 0; // number of P1 entries consumed so far
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // NOTE(review): V_multabs[K_OVER_2*c + k] presumably packs the product tables for output columns 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover column is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc(k,c) += sum over r of table(r,k) * Pv(r,c) -> Pv: v x k, acc: k x k; every entry is 6 uint64_t limbs of packed nibbles, handled as two overlapping 256-bit halves
+    size_t k;
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // NOTE(review): V_multabs[K_OVER_2*r + k] presumably packs the product tables for output rows 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator rows k and k+1; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover row is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// acc += P1*S1 + P2*S2 -> P1: v x v (upper triangular, stored row by row), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Every matrix entry spans 6 uint64_t limbs of packed nibbles and is handled as two overlapping 256-bit halves (limbs 0..3 and 2..5).
+    size_t k,c;
+    size_t P1_cols_used = 0; // number of P1 entries consumed so far
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        // P1 times S1
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used++;
+            // NOTE(review): each multabs vector presumably packs the product tables for output columns 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 times S2, accumulated into the same temporaries
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover column is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// acc += P3*S2 -> P3: o x o (upper triangular, stored row by row), S2: o x k, acc: o x k
+static 
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Every matrix entry spans 6 uint64_t limbs of packed nibbles and is handled as two overlapping 256-bit halves (limbs 0..3 and 2..5).
+    size_t k,c;
+    size_t cols_used = 0; // number of P3 entries consumed so far
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // NOTE(review): S2_multabs[K_OVER_2*c + k] presumably packs the product tables for output columns 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover column is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+    mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); // thin alias: forwards all three arguments unchanged to the Vt*Pv kernel
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc(k,c) += sum over r of table(r,k) * PS2(r,c) -> PS2: o x k, acc: k x k; every entry is 6 uint64_t limbs of packed nibbles, handled as two overlapping 256-bit halves
+    size_t k;
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of PS2 and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // limbs 0..3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // limbs 2..5: field elements #32 to #95 (overlaps the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // NOTE(review): S2_multabs[K_OVER_2*r + k] presumably packs the product tables for output rows 2k and 2k+1 into the low/high nibble of its bytes - confirm against the table generator
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator rows k and k+1; all loads precede the stores because the two halves of an entry overlap in limbs 2..3
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: k == K_MAX - 1 here and only that single leftover row is written
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1. Only the low four bits of a are used; b is presumably a 4-bit value too (TODO confirm callers).
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // carryless multiply: XOR together b scaled by each of the four low bits of a (no branches, no table lookups)
+    unsigned char p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1: fold the bits above the low nibble back down using x^4 = x + 1
+    unsigned char top_p = p & 0xf0;
+    unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f; // >> 4 adds the "1" term, >> 3 adds the "x" term
+    return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { // byte-lane-wise variant of mul_f: scales every byte of b by a
+    // carryless multiply on all eight byte lanes at once; assumes each byte of b is < 16 so lanes cannot overflow into each other
+    uint64_t p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1 in every byte lane; the result keeps only the low nibble of each byte
+    uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+    uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f;
+    return out;
+}
+
+// GF(16) addition: bitwise XOR of the two operands
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: implemented as the same XOR as add_f
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: returns a unchanged (consistent with add_f and sub_f being the same XOR)
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) { // returns a^14 computed with mul_f; an input of 0 yields 0
+    // Lookup-table alternative noted by the original author (not compiled):
+    // {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, 10, 4, 3, 8}[a & 15]
+
+    unsigned char a2 = mul_f(a, a);
+    unsigned char a4 = mul_f(a2, a2);
+    unsigned char a8 = mul_f(a4, a4);
+    unsigned char a6 = mul_f(a2, a4);
+    unsigned char a14 = mul_f(a8, a6); // a^8 * a^6 = a^14
+
+    return a14;
+}
+
+// GF(16) dot product of a[0..n-1] with every m-th entry of b: sum over i of a[i] * b[i * m].
+static inline unsigned char lincomb(const unsigned char *a, const unsigned char *b, int n, int m) {
+    unsigned char sum = 0;
+    for (int idx = 0; idx < n; idx++) {
+        sum = add_f(mul_f(a[idx], b[idx * m]), sum);
+    }
+    return sum;
+}
+
+// c = a * b over GF(16), all row-major: a is row_a x colrow_ab, b is colrow_ab x col_b, c is row_a x col_b.
+static inline void mat_mul(const unsigned char *a, const unsigned char *b, unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; r++) {
+        for (int col = 0; col < col_b; col++) {
+            c[r * col_b + col] = lincomb(a + r * colrow_ab, b + col, colrow_ab, col_b);
+        }
+    }
+}
+
+// c = a + b element-wise (add_f) for row-major m x n matrices.
+static inline void mat_add(const unsigned char *a, const unsigned char *b, unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; row++) {
+        for (int col = 0, at = row * n; col < n; col++, at++) {
+            c[at] = add_f(a[at], b[at]);
+        }
+    }
+}
+
+// Copy m_legs * 2 words of 64 bits from in to out.
+static inline void m_vec_copy(int m_legs, const uint64_t *in, uint64_t *out) {
+    const int n_words = m_legs * 2;
+    for (int w = 0; w < n_words; ++w) {
+        out[w] = in[w];
+    }
+}
+
+// XOR m_legs * 2 words of 64 bits from in into acc.
+static inline void m_vec_add(int m_legs, const uint64_t *in, uint64_t *acc) {
+    const int n_words = m_legs * 2;
+    for (int w = 0; w < n_words; ++w) {
+        acc[w] = acc[w] ^ in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { // scales each of the 8 nibbles of a by b; only bits 0..3 of b are used
+    uint32_t a_msb;
+    uint32_t a32 = a;
+    uint32_t b32 = b;
+    uint32_t r32 = a32 * (b32 & 1); // contribution of bit 0 of b: a itself or 0
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 *= x per nibble: shift, then add x + 1 (= 3) wherever x^4 was produced
+    r32 ^= (a32) * ((b32 >> 1) & 1); // add a * x when bit 1 of b is set
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 now holds a * x^2
+    r32 ^= (a32) * ((b32 >> 2) & 1); // add a * x^2 when bit 2 of b is set
+
+    a_msb = a32 & 0x88888888; // MSB, 3rd bits
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 now holds a * x^3
+    r32 ^= (a32) * ((b32 >> 3) & 1); // add a * x^3 when bit 3 of b is set
+
+    return r32;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0, n_words = m_legs * 2; w < n_words; ++w) { // acc ^= a * in, one 64-bit word at a time
+        acc[w] ^= gf16v_mul_u64(in[w], a); // gf16v_mul_u64 is not declared in this header; the includer must provide it
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int w = 0, n_words = m_legs * 2; w < n_words; ++w) { // acc ^= 0x2 * in (0x2 is the field element x)
+        acc[w] ^= gf16v_mul_u64(in[w], 0x2); // gf16v_mul_u64 is not declared in this header; the includer must provide it
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/LICENSE b/src/sig/mayo/pqmayo_mayo-1_opt/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/NOTICE b/src/sig/mayo/pqmayo_mayo-1_opt/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-1_opt/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, // used when ENABLE_AESNI is not defined
+                const unsigned char *input, size_t inputByteLen) {
+    (void) inputByteLen; // the input length is not used on this path
+    uint8_t iv[12] = { 0 }; // all-zero 12-byte IV
+    aes128ctr_prf(output, outputByteLen, input, iv); // presumably input is the AES-128 key — TODO confirm against aes.h
+    return (int) outputByteLen; // the requested length narrowed to int; no error is reported
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.c b/src/sig/mayo/pqmayo_mayo-1_opt/api.c
new file mode 100644
index 0000000000..b7e2ef80ce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_1
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Key generation entry point: forwards to mayo_keypair() with MAYO_PARAMS and returns its result.
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Signed-message entry point: forwards to mayo_sign() with MAYO_PARAMS and returns its result.
+int crypto_sign(unsigned char *sm, size_t *smlen,
+                const unsigned char *m, size_t mlen,
+                const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+// Detached-signature entry point: forwards to mayo_sign_signature() with MAYO_PARAMS and returns its result.
+int crypto_sign_signature(unsigned char *sig, size_t *siglen,
+                          const unsigned char *m, size_t mlen,
+                          const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Signed-message opening entry point: forwards to mayo_open() with MAYO_PARAMS and returns its result.
+int crypto_sign_open(unsigned char *m, size_t *mlen,
+                     const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+// Detached-signature verification: returns -1 unless siglen is exactly CRYPTO_BYTES, else mayo_verify()'s result.
+int crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                       const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    // A signature of the wrong length is rejected without calling the verifier.
+    const int length_ok = (siglen == CRYPTO_BYTES);
+    return length_ok ? mayo_verify(MAYO_PARAMS, m, mlen, sig, pk) : -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/api.h b/src/sig/mayo/pqmayo_mayo-1_opt/api.h
new file mode 100644
index 0000000000..86b7bd545d
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 1168
+#define CRYPTO_BYTES 321
+
+#define CRYPTO_ALGNAME "MAYO-1"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    // Fold a size x size matrix of m-vectors into packed upper-triangular form, row by row:
+    // out receives in[r][r] on the diagonal and in[r][c] + in[c][r] for every c > r.
+    const int vec_words = m_legs * 2;
+    uint64_t *dst = out;
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++, dst += vec_words) {
+            m_vec_copy(m_legs, in + vec_words * (r * size + c), dst);
+            if (c > r) { m_vec_add(m_legs, in + vec_words * (c * size + r), dst); }
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    // Portable path: accumulates (P1 + P1^T) * O into acc, walking P1 as a packed upper-triangular list of m-vectors.
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    int bs_mat_entries_used = 0; // index of the current (r, c) entry inside the packed P1
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) {
+                // diagonal entries add nothing to acc but still occupy one slot of P1
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+// Sign-time products, per the header's description: M = V*L and Y = V*P1*V^T.
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p; // keeps builds quiet when no branch below ends up reading p
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    // Portable path: the same three products as the AVX2 branches, via the generic helpers.
+    // Pv is a zero-initialised scratch buffer for the intermediate P1 * V^T.
+    const int m_legs  = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - param_o;
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+
+    // V with L into M (dimensions passed: k, n - o, o)
+    mul_add_mat_x_m_mat(m_legs, V, L, M, param_k, param_v, param_o);
+    // upper-triangular P1 with V transposed into Pv
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, V, Pv, param_v, param_v, param_k, 1);
+    // V with Pv into Y (dimensions passed: k, n - o, k)
+    mul_add_mat_x_m_mat(m_legs, V, Pv, Y, param_k, param_v, param_k);
+#endif
+}
+
+// KeyGen product, per the header's description: P3 = O^T * (P1*O + P2); P1O_P2 is updated in place on the way.
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p; // keeps builds quiet when no branch below ends up reading p
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    // Portable path: first the upper-triangular P1 with O into P1O_P2, then O transposed with P1O_P2 into P3.
+    const int m_legs = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int n_minus_o = PARAM_n(p) - param_o;
+
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, n_minus_o, n_minus_o, param_o, 1);
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y - Ar) in row major order; A_cols is used interchangeably with k*o+1
+// - y is a vector with m elements
+// - x is k*o bytes long; r supplies k*o values (but see the note at the mat_mul call below)
+// return: 1 on success, 0 on failure (the last row of the echelon form has no nonzero coefficient)
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1); // inner length is k*o+1 — NOTE(review): confirm r has k*o+1 readable bytes
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1); // presumably brings A into echelon form in place — see echelon_form.h
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+    // rows are processed from the bottom up; every candidate column is visited and masked rather than branched on
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the two bytes are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; // right-hand side of this row, or 0 if col is not the pivot
+            x[col] ^= u;
+            // eliminate column col from the right-hand side, eight rows per step (rows i..i+7 are read and written)
+            for (int i = 0; i < row; i += 8) { // NOTE(review): i+7 can reach past row; confirm A is padded to a multiple of 8 rows
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column; // once the pivot has been seen, later columns of this row are masked out
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// KeyGen: calculate P3 = O^T * (P1*O + P2). Every #define in this header routes the public name through MAYO_NAMESPACE (defined elsewhere).
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// KeyGen: calculate Upper(in) and write it to out (see the definition for the meaning of m_legs and size).
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// expand_sk: calculate acc = (P1+P1^T)*O; the caller passes the already formed P1+P1^T as P1P1t.
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Sign: calculate both M = V*L and Y = V*P1*V^T in a single call.
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sign: sample a solution. NOTE(review): the int return looks like a success flag -- confirm against the definition.
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Verify: calculate SPS = S*P*S^T, with P supplied as the three blocks P1, P2 and P3.
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 128-element vector.
+//
+// in:  source vector, eight uint64_t limbs, left untouched
+// out: destination, eight uint64_t limbs, overwritten
+//
+// Limbs are copied in ascending index order.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 8; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+
+// Add a 128-element vector into an accumulator.
+//
+// Each of the eight uint64_t limbs of `in` is XORed into the matching limb
+// of `acc`, in ascending index order.
+//
+// in:  source vector, 8 limbs, left untouched
+// acc: destination vector, 8 limbs, updated in place
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc ^= x * in, element-wise over the 8 limbs of a 128-element vector (one GF(16) element per nibble, x^4 = x + 1).
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_msb = 0x8888888888888888ULL;   // bit 3 of every nibble
+    for (int i = 0; i < 8; i++) {
+        const uint64_t t = in[i] & mask_msb;            // x^3 coefficients, which overflow on the shift
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);  // shift the rest up; each overflow becomes x + 1 (0b0011)
+    }
+}
+
+// acc ^= x^-1 * in, element-wise over the 8 limbs of a 128-element vector (one GF(16) element per nibble, x^-1 = x^3 + 1).
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_lsb = 0x1111111111111111ULL;   // bit 0 of every nibble
+    for (int i = 0; i < 8; i++) {
+        const uint64_t t = in[i] & mask_lsb;            // constant coefficients, which drop out on the shift
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);         // shift the rest down; each dropped 1 becomes x^3 + 1 (0b1001)
+    }
+}
+
+// acc[i] ^= a * in[i] for the eight limbs of a 128-element vector; `a` is a
+// single field element and every nibble of in[i] is multiplied by it.
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t tab = mul_table(a);
+    // multipliers for bit 0..3 of every nibble, read from bytes 0..3 of the table (byte 0 is taken whole)
+    const uint32_t m0 = tab & 0xff, m1 = (tab >> 8) & 0xf, m2 = (tab >> 16) & 0xf, m3 = (tab >> 24) & 0xf;
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t w = in[limb];
+        acc[limb] ^= ((w & nibble_lsb) * m0) ^ (((w >> 1) & nibble_lsb) * m1)
+                   ^ (((w >> 2) & nibble_lsb) * m2) ^ (((w >> 3) & nibble_lsb) * m3);
+    }
+}
+
+// Collapse 16 bins of 128-element vectors into one vector.
+//
+// bins: 16 consecutive vectors of 8 limbs each; bins 1..15 are read and
+//       partly overwritten in place, bin 0 is never accessed
+// out:  receives a copy of bin 1 after all folding steps
+static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains of bin indices that both finish in bin 1.
+    // Step s adds x^-1 * bin[via_x_inv[s]] into bin[via_x_inv[s + 1]] and
+    // then x * bin[via_x[s]] into bin[via_x[s + 1]].
+    static const unsigned char via_x_inv[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char via_x[8]     = {11, 12, 6, 3, 8, 4, 2, 1};
+
+    for (int s = 0; s < 7; s++) {
+        m_vec_mul_add_x_inv_128(bins + via_x_inv[s] * 8, bins + via_x_inv[s + 1] * 8);
+        m_vec_mul_add_x_128(bins + via_x[s] * 8, bins + via_x[s + 1] * 8);
+    }
+
+    vec_copy_128(bins + 8, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 64-element vector (four uint64_t limbs) from `in` to `out`.
+// Limbs are written in ascending index order; `in` is left untouched.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 4; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// XOR the four uint64_t limbs of the 64-element vector `in` into `acc`.
+// `acc` is updated in place, limb by limb in ascending order.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// Multiply every GF(16) element packed in `a` (one per nibble, 16 per word)
+// by the single field element held in the low four bits of `b`.
+//
+// Arithmetic is in Z_2[x]/(x^4+x+1): the routine forms a, a*x, a*x^2 and
+// a*x^3 in turn and XORs in each one whose matching bit of `b` is set.
+// Selection is done by multiplying with a 0/1 value, so the C source has
+// no branch on `b`. Bits 4..7 of `b` are ignored.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t nibble_top = 0x8888888888888888ULL;
+    const uint64_t scalar = b;
+
+    uint64_t power = a;                     // a * x^0
+    uint64_t product = power * (scalar & 1);
+
+    for (int bit = 1; bit <= 3; bit++) {
+        // Advance power to a * x^bit: shift the low three bits of each
+        // nibble up and replace an overflowing x^4 with x + 1 (0b0011).
+        const uint64_t carry = power & nibble_top;
+        power = ((power ^ carry) << 1) ^ ((carry >> 3) * 3);
+        product ^= power * ((scalar >> bit) & 1);
+    }
+
+    return product;
+}
+
+// For b < 16: pack b, b*x, b*x^2 and b*x^3 (reduced mod x^4+x+1) into the low nibbles of
+// bytes 0..3 of the result. The high nibbles are not cleared; callers mask them off.
+static inline uint32_t mul_table(uint8_t b) {
+    const uint32_t spread = ((uint32_t) b) * 0x08040201;  // for b < 16, byte i holds b << i
+    const uint32_t overflow = spread & 0xf0f0f0f0;         // bits that left each low nibble
+    // x^4 = x + 1: fold every overflow bit back in at weight 1 (>> 4) and at weight x (>> 3)
+    return (spread ^ (overflow >> 4) ^ (overflow >> 3));
+}
+
+// acc[i] ^= a * in[i] for the four limbs of a 64-element vector; `a` is a
+// single field element and every nibble of in[i] is multiplied by it.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t tab = mul_table(a);
+    // multipliers for bit 0..3 of every nibble, read from bytes 0..3 of the table (byte 0 is taken whole)
+    const uint32_t m0 = tab & 0xff, m1 = (tab >> 8) & 0xf, m2 = (tab >> 16) & 0xf, m3 = (tab >> 24) & 0xf;
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t w = in[limb];
+        acc[limb] ^= ((w & nibble_lsb) * m0) ^ (((w >> 1) & nibble_lsb) * m1)
+                   ^ (((w >> 2) & nibble_lsb) * m2) ^ (((w >> 3) & nibble_lsb) * m3);
+    }
+}
+
+// Variable-length form of vec_mul_add_64: acc[i] ^= a * in[i] for `legs` limbs.
+// `a` is a single field element; every nibble of in[i] is multiplied by it.
+static inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t tab = mul_table(a);
+    // multipliers for bit 0..3 of every nibble, read from bytes 0..3 of the table (byte 0 is taken whole)
+    const uint32_t m0 = tab & 0xff, m1 = (tab >> 8) & 0xf, m2 = (tab >> 16) & 0xf, m3 = (tab >> 24) & 0xf;
+    for (int limb = 0; limb < legs; limb++) {
+        const uint64_t w = in[limb];
+        acc[limb] ^= ((w & nibble_lsb) * m0) ^ (((w >> 1) & nibble_lsb) * m1)
+                   ^ (((w >> 2) & nibble_lsb) * m2) ^ (((w >> 3) & nibble_lsb) * m3);
+    }
+}
+
+// acc ^= x * in, element-wise over the 4 limbs of a 64-element vector (one GF(16) element per nibble, x^4 = x + 1).
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_msb = 0x8888888888888888ULL;   // bit 3 of every nibble
+    for (int i = 0; i < 4; i++) {
+        const uint64_t t = in[i] & mask_msb;            // x^3 coefficients, which overflow on the shift
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);  // shift the rest up; each overflow becomes x + 1 (0b0011)
+    }
+}
+
+// acc ^= x^-1 * in, element-wise over the 4 limbs of a 64-element vector (one GF(16) element per nibble, x^-1 = x^3 + 1).
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_lsb = 0x1111111111111111ULL;   // bit 0 of every nibble
+    for (int i = 0; i < 4; i++) {
+        const uint64_t t = in[i] & mask_lsb;            // constant coefficients, which drop out on the shift
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);         // shift the rest down; each dropped 1 becomes x^3 + 1 (0b1001)
+    }
+}
+
+// Collapse 16 bins of 64-element vectors into one vector.
+//
+// bins: 16 consecutive vectors of 4 limbs each; bins 1..15 are read and
+//       partly overwritten in place, bin 0 is never accessed
+// out:  receives a copy of bin 1 after all folding steps
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains of bin indices that both finish in bin 1.
+    // Step s adds x^-1 * bin[via_x_inv[s]] into bin[via_x_inv[s + 1]] and
+    // then x * bin[via_x[s]] into bin[via_x[s + 1]].
+    static const unsigned char via_x_inv[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char via_x[8]     = {11, 12, 6, 3, 8, 4, 2, 1};
+
+    for (int s = 0; s < 7; s++) {
+        m_vec_mul_add_x_inv_64(bins + via_x_inv[s] * 4, bins + via_x_inv[s + 1] * 4);
+        m_vec_mul_add_x_64(bins + via_x[s] * 4, bins + via_x[s + 1] * 4);
+    }
+
+    vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 96-element vector.
+//
+// in:  source vector, six uint64_t limbs, left untouched
+// out: destination, six uint64_t limbs, overwritten in ascending order
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 6; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// Add a 96-element vector into an accumulator.
+//
+// in:  source vector, six uint64_t limbs, left untouched
+// acc: destination, six uint64_t limbs; each limb is XORed with in's limb
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc ^= x * in, element-wise over the 6 limbs of a 96-element vector (one GF(16) element per nibble, x^4 = x + 1).
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_msb = 0x8888888888888888ULL;   // bit 3 of every nibble
+    for (int i = 0; i < 6; i++) {
+        const uint64_t t = in[i] & mask_msb;            // x^3 coefficients, which overflow on the shift
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3);  // shift the rest up; each overflow becomes x + 1 (0b0011)
+    }
+}
+
+// acc ^= x^-1 * in, element-wise over the 6 limbs of a 96-element vector (one GF(16) element per nibble, x^-1 = x^3 + 1).
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t mask_lsb = 0x1111111111111111ULL;   // bit 0 of every nibble
+    for (int i = 0; i < 6; i++) {
+        const uint64_t t = in[i] & mask_lsb;            // constant coefficients, which drop out on the shift
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9);         // shift the rest down; each dropped 1 becomes x^3 + 1 (0b1001)
+    }
+}
+
+// acc[i] ^= a * in[i] for the six limbs of a 96-element vector; `a` is a
+// single field element and every nibble of in[i] is multiplied by it.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t tab = mul_table(a);
+    // multipliers for bit 0..3 of every nibble, read from bytes 0..3 of the table (byte 0 is taken whole)
+    const uint32_t m0 = tab & 0xff, m1 = (tab >> 8) & 0xf, m2 = (tab >> 16) & 0xf, m3 = (tab >> 24) & 0xf;
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t w = in[limb];
+        acc[limb] ^= ((w & nibble_lsb) * m0) ^ (((w >> 1) & nibble_lsb) * m1)
+                   ^ (((w >> 2) & nibble_lsb) * m2) ^ (((w >> 3) & nibble_lsb) * m3);
+    }
+}
+
+// Collapse 16 bins of 96-element vectors into one vector.
+//
+// bins: 16 consecutive vectors of 6 limbs each; bins 1..15 are read and
+//       partly overwritten in place, bin 0 is never accessed
+// out:  receives a copy of bin 1 after all folding steps
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains of bin indices that both finish in bin 1.
+    // Step s adds x^-1 * bin[via_x_inv[s]] into bin[via_x_inv[s + 1]] and
+    // then x * bin[via_x[s]] into bin[via_x[s + 1]].
+    static const unsigned char via_x_inv[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char via_x[8]     = {11, 12, 6, 3, 8, 4, 2, 1};
+
+    for (int s = 0; s < 7; s++) {
+        m_vec_mul_add_x_inv_96(bins + via_x_inv[s] * 6, bins + via_x_inv[s + 1] * 6);
+        m_vec_mul_add_x_96(bins + via_x[s] * 6, bins + via_x[s + 1] * 6);
+    }
+
+    vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdalign.h>
+
+#ifndef MAYO_VARIANT
+// Combine the 16 accumulator bins at `bins` into a single vector in `out`.
+//
+// Bin i starts i * m_legs * 2 words after `bins`. The bins are modified in
+// place: first every bin whose index has more than one bit set is passed to
+// m_vec_add twice, once for each of two other bins whose indices XOR to its
+// own index; then bins 8, 4 and 2 are chained into bin 1 through
+// m_vec_mul_add_x, and bin 1 is handed to m_vec_copy together with `out`.
+// Bin 0 is never accessed.
+//
+// m_vec_add, m_vec_mul_add_x and m_vec_copy are defined elsewhere.
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+    // {source, first target, second target}; processed top to bottom because
+    // later rows use bins that earlier rows have already been folded into.
+    static const unsigned char fold[11][3] = {
+        {15, 12, 3},
+        {14,  8, 6},
+        {13, 10, 7},
+        {12,  8, 4},
+        {11,  9, 2},
+        {10,  8, 2},
+        { 9,  8, 1},
+        { 7,  4, 3},
+        { 6,  4, 2},
+        { 5,  4, 1},
+        { 3,  2, 1},
+    };
+    const int stride = m_legs * 2;
+
+    for (int step = 0; step < 11; step++) {
+        uint64_t *src = bins + fold[step][0] * stride;
+        m_vec_add(m_legs, src, bins + fold[step][1] * stride);
+        m_vec_add(m_legs, src, bins + fold[step][2] * stride);
+    }
+
+    // chain 8 -> 4 -> 2 -> 1, one m_vec_mul_add_x call per link
+    for (int bin = 8; bin > 1; bin /= 2) {
+        m_vec_mul_add_x(m_legs, bins + bin * stride, bins + (bin / 2) * stride);
+    }
+
+    m_vec_copy(m_legs, bins + 1 * stride, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *PS) {
+
+    const int n = o + v; // number of entries in one row of S
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+    (void)m;
+#else
+    const int m_legs = m / 32;
+#endif
+
+    /* Overview (this comment replaces a commented-out earlier implementation).
+     *
+     * Inputs, as indexed below:
+     *   P1  upper-triangular v x v block, entries (row, j >= row) stored consecutively
+     *   P2  v x o block, entry (row, j) at index row * o + j
+     *   P3  upper-triangular o x o block, entries stored consecutively
+     *   S   k rows of n entries; entry S[col * n + j] selects one of 16 bins
+     *       (assumes every entry is < 16 -- TODO confirm against callers)
+     *   PS  output, one vector per (row, col) pair at index row * k + col
+     * Every P / PS entry is a vector of 4, 6 or 8 uint64_t words when MAYO_VARIANT
+     * fixes M_MAX to 64, 96 or 128, and of m_legs * 2 words otherwise.
+     *
+     * Method: each P entry (row, j) is added into bin S[col * n + j] of the
+     * 16-bin accumulator owned by output (row, col); afterwards the bins of every
+     * accumulator are collapsed into a single vector by multiply_bins_* /
+     * m_multiply_bins and written to PS.
+     *
+     * The bin address depends on the value of S, so memory access is data
+     * dependent. The replaced implementation (three calls to
+     * mul_add_m_upper_triangular_mat_x_mat_trans on S split into S1 / S2) was
+     * constant time, which according to its original note is not needed here. */
+
+    // Stack-efficient path (the original note names MAYO_3 and MAYO_5): one row of S per pass, so the accumulator needs only M_MAX * N_MAX words.
+    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+    uint64_t accumulator[M_MAX * N_MAX] = {0}; // 16 bins for each of the n rows of PS; cleared again for every col
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) {
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+            accumulator[i] = 0;
+        }
+        P1_used = 0;
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P1_used ++;
+            }
+
+            for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+            }
+        }
+
+        P3_used = 0;
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P3_used ++;
+            }
+        }
+
+        for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+           multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+           multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+           multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+           bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+        }
+    }
+
+    #else
+
+    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0}; // 16 bins for every (row, col) pair of PS, all columns at once
+    int P1_used = 0;
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P1_used ++;
+        }
+
+
+        for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+            }
+#endif
+        }
+    }
+
+    int P3_used = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P3_used ++;
+        }
+    }
+
+    // Collapse the 16 bins of every (row, col) accumulator and store the result in PS; the multiply_bins_* variants overwrite the PS entry rather than adding to it.
+    int i = 0;
+    while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+        i++;
+#endif
+    }
+
+    #endif
+}
+
+
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){
+    // Bucket accumulator: every (row, col) pair owns 16 consecutive bins and
+    // the value S[row * n + j] is used as the number of the bin to add into.
+    alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0};
+    #if !defined(MAYO_VARIANT)
+    const int m_legs = m/32;
+    #else
+    (void) m;
+    #endif
+    // Pass 1: add each vector PS[j][col] into the bin selected by S[row][j].
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            const int bin = S[row * n + j];
+            for (int col = 0; col < k; col++) {
+                const int slot = (row * k + col) * 16 + bin;
+                #if defined(MAYO_VARIANT) && (M_MAX == 64)
+                    vec_add_64(PS + (j * k + col) * 4, accumulator + slot * 4);
+                #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                    vec_add_96(PS + (j * k + col) * 6, accumulator + slot * 6);
+                #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                    vec_add_128(PS + (j * k + col) * 8, accumulator + slot * 8);
+                #else
+                    m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + slot * m_legs * 2);
+                #endif
+            }
+        }
+    }
+
+    // Pass 2: hand each group of 16 bins to multiply_bins, which writes entry i of SPS.
+    for (int i = 0; i < k * k; i++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8);
+#else
+        m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2);
+#endif
+    }
+}
+
+
+// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+    (void) m_legs; // only read by the generic (#else) path below
+    int next_entry = 0; // entries of bs_mat are consumed in the order visited
+    for (int row = 0; row < bs_mat_rows; row++) {
+        // with triangular == 1 the scan starts on the diagonal (col = row)
+        for (int col = triangular * row; col < bs_mat_cols; col++) {
+            for (int j = 0; j < mat_cols; j++) {
+                const unsigned char scalar = mat[col * mat_cols + j];
+                const int dst = row * mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * next_entry, scalar, acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * next_entry, scalar, acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * next_entry, scalar, acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * next_entry, scalar, acc + m_legs * 2 * dst);
+#endif
+            }
+            next_entry += 1;
+        }
+    }
+}
+
+// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+    (void) m_legs; // only read by the generic (#else) path below
+    int next_entry = 0; // entries of bs_mat are consumed in the order visited
+    for (int row = 0; row < bs_mat_rows; row++) {
+        // with triangular == 1 the scan starts on the diagonal (col = row)
+        for (int col = triangular * row; col < bs_mat_cols; col++) {
+            for (int j = 0; j < mat_rows; j++) {
+                const unsigned char scalar = mat[j * bs_mat_cols + col]; // mat is read transposed
+                const int dst = row * mat_rows + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * next_entry, scalar, acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * next_entry, scalar, acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * next_entry, scalar, acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * next_entry, scalar, acc + m_legs * 2 * dst);
+#endif
+            }
+            next_entry += 1;
+        }
+    }
+}
+
+// multiplies the transpose of a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+    (void) m_legs; // only read by the generic (#else) path below
+    for (int out_row = 0; out_row < mat_cols; out_row++) {
+        for (int in_row = 0; in_row < mat_rows; in_row++) {
+            for (int col = 0; col < bs_mat_cols; col++) {
+                const unsigned char scalar = mat[in_row * mat_cols + out_row]; // mat is read transposed
+                const int src = in_row * bs_mat_cols + col;
+                const int dst = out_row * bs_mat_cols + col;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * src, scalar, acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * src, scalar, acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * src, scalar, acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * src, scalar, acc + m_legs * 2 * dst);
+#endif
+            }
+        }
+    }
+}
+
+// multiplies a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+    (void) m_legs; // only read by the generic (#else) path below
+    for (int out_row = 0; out_row < mat_rows; out_row++) {
+        for (int inner = 0; inner < mat_cols; inner++) {
+            for (int col = 0; col < bs_mat_cols; col++) {
+                const unsigned char scalar = mat[out_row * mat_cols + inner];
+                const int src = inner * bs_mat_cols + col;
+                const int dst = out_row * bs_mat_cols + col;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * src, scalar, acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * src, scalar, acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * src, scalar, acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * src, scalar, acc + m_legs * 2 * dst);
+#endif
+            }
+        }
+    }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    // 16 four-bit elements per word; element 0 of a word is its lowest 4 bits.
+    const uint64_t word = in[index / 16];
+    const int shift = (index % 16) * 4;
+    return (unsigned char) ((word >> shift) & 0xF);
+}
+
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+    // Packs ncols values from `in` (one per byte) two per byte into `out`:
+    // the even-indexed value goes to the low 4 bits, the odd-indexed value is
+    // shifted into the high 4 bits. A trailing unpaired value is stored with
+    // nothing OR-ed on top. Bytes of `out` past the packed data are untouched.
+    unsigned char *out8 = (unsigned char *)out;
+    const int nbytes = ncols / 2 + ncols % 2;
+    for (int b = 0; b < nbytes; b++) {
+        const int has_hi = (2 * b + 1 < ncols);
+        const unsigned char lo = in[2 * b];
+        const unsigned char hi = has_hi ? in[2 * b + 1] : 0;
+#ifdef TARGET_BIG_ENDIAN
+        out8[(((b + 8) / 8) * 8) - 1 - b % 8] = lo | (hi << 4);
+#else
+        out8[b] = lo | (hi << 4);
+#endif
+    }
+}
+
+static inline void ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+    // Expands legs * 8 packed bytes of `in` into legs * 16 values, low 4 bits first.
+    const unsigned char *in8 = (const unsigned char *)in;
+    for (int b = 0; b < legs * 8; b++) {
+#ifdef TARGET_BIG_ENDIAN
+        const unsigned char packed = in8[(((b + 8) / 8) * 8) - 1 - b % 8];
+#else
+        const unsigned char packed = in8[b];
+#endif
+        out[2 * b]     = packed & 0xF;
+        out[2 * b + 1] = packed >> 4;
+    }
+}
+
+
+// Put the nrows x ncols matrix A (row-major, one 4-bit element per byte) into row
+// echelon form with ones on first nonzero entries, in place and *in constant time*.
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+    // scratch buffers are sized from the *_MAX constants and wiped before returning
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };
+
+    int row_len = (ncols + 15) / 16; // uint64_t words per packed row (16 elements each)
+
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+
+    // pivot row is secret, pivot col is not
+
+    unsigned char inverse;
+    int pivot_row = 0;
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+
+        // try to get a pivot row in constant time
+        unsigned char pivot = 0;
+        uint64_t pivot_is_zero = -1;
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+
+            for (int j = 0; j < row_len; j++) {
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            for (int col = 0; col < row_len; col++) {
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+
+        // eliminate entries below pivot
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            uint64_t below_pivot = ct_64_is_greater_than(row, pivot_row);
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+            // select with the all-ones/zero mask: pivot_row is secret, so avoid `>` and a multiply here
+            vec_mul_add_u64(row_len, _pivot_row2, (unsigned char) (below_pivot & elt_to_elim),
+                                    packed_A + row * row_len);
+        }
+
+        pivot_row += (-(int64_t)(~pivot_is_zero));
+    }
+
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+
+    // unbitslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = temp[j];
+        }
+    }
+
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    // Unpacks mdeclen 4-bit values from m (low 4 bits first), one per byte of mdec.
+    const int full_bytes = mdeclen / 2;
+    for (int b = 0; b < full_bytes; ++b) {
+        mdec[2 * b]     = m[b] & 0xf;
+        mdec[2 * b + 1] = m[b] >> 4;
+    }
+    if (mdeclen % 2 == 1) {
+        mdec[mdeclen - 1] = m[full_bytes] & 0x0f;
+    }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    // Packs mlen values (one per byte of m) two per byte, first value in the low 4 bits.
+    const int pairs = mlen / 2;
+    for (int b = 0; b < pairs; ++b) {
+        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
+    }
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[mlen - 1];
+    }
+}
+
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // For every pair i <= j: temp = temp * X mod f(X), then temp ^= vPv[i][j] (^ vPv[j][i] when i != j). Finally y = t ^ temp.
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0};
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    int k = 0; // index of a 64-bit word of temp (unrelated to PARAM_k); carried across iterations
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // 4 bits about to be shifted out of word k
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else {
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--; // leave k at the last word, PARAM_m(p)/16 - 1, for the next shift
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+static void transpose_16x16_nibbles(uint64_t *M){
+    // In-place transpose of a 16x16 matrix of 4-bit entries, one row per
+    // 64-bit word. Four rounds of masked swaps exchange the off-diagonal
+    // 1x1, 2x2, 4x4 and 8x8 blocks between rows r and r + block size.
+    const uint64_t keep_nibbles = 0x0f0f0f0f0f0f0f0f;
+    const uint64_t keep_bytes   = 0x00ff00ff00ff00ff;
+    const uint64_t keep_2bytes  = 0x0000ffff0000ffff;
+    const uint64_t keep_half    = 0x00000000ffffffff;
+
+    // round 1: rows r and r + 1, 4-bit groups
+    for (size_t r = 0; r < 16; r += 2) {
+        const uint64_t diff = ((M[r] >> 4) ^ M[r + 1]) & keep_nibbles;
+        M[r]     ^= diff << 4;
+        M[r + 1] ^= diff;
+    }
+
+    // round 2: rows r and r + 2, 8-bit groups
+    for (size_t base = 0; base < 16; base += 4) {
+        for (size_t r = base; r < base + 2; r++) {
+            const uint64_t diff = ((M[r] >> 8) ^ M[r + 2]) & keep_bytes;
+            M[r]     ^= diff << 8;
+            M[r + 2] ^= diff;
+        }
+    }
+
+    // round 3: rows r and r + 4, 16-bit groups
+    for (size_t base = 0; base < 16; base += 8) {
+        for (size_t r = base; r < base + 4; r++) {
+            const uint64_t diff = ((M[r] >> 16) ^ M[r + 4]) & keep_2bytes;
+            M[r]     ^= diff << 16;
+            M[r + 4] ^= diff;
+        }
+    }
+
+    // round 4: rows r and r + 8, 32-bit groups
+    for (size_t r = 0; r < 8; r++) {
+        const uint64_t diff = ((M[r] >> 32) ^ M[r + 8]) & keep_half;
+        M[r]     ^= diff << 32;
+        M[r + 8] ^= diff;
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // A is assembled in a local uint64_t array and only unpacked into A_out (one value per byte, row stride PARAM_A_cols(p)) by the last loop.
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+            // the shift grows by 4 bits per (i, j) pair; a full word of shift is carried into words_to_shift
+            bits_to_shift += 4;
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+    // transpose every group of 16 consecutive words of A as a 16x16 matrix of 4-bit entries
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+    // tab[4*i + b] = f_tail[i] * 2^b, i.e. each tail coefficient times 1, 2, 4 and 8
+    unsigned char tab[F_TAIL_LEN*4] = {0};
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    // fold every row r >= m back into rows r - m + t (t < F_TAIL_LEN), one bit-plane of the 4-bit entries at a time
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble;
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+    // unpack the first m rows, 16 entries (one word) at a time; the column with index PARAM_A_cols(p) - 1 is not written here
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    // Thin wrapper around mayo_keypair_compact(): the keys produced here are
+    // the compact public key and the compact secret key.
+    //
+    //   p  - parameter set
+    //   pk - receives the compact public key
+    //   sk - receives the compact secret key
+    //
+    // The status code of mayo_keypair_compact() is returned unchanged.
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    // Signs the mlen-byte message m with the compact secret key csk.
+    // On success: writes PARAM_sig_bytes(p) bytes (encoded solution, then the
+    // salt) to sig, stores that length in *siglen and returns MAYO_OK.
+    // On failure: returns another status and leaves sig / *siglen untouched.
+    int ret = MAYO_OK;
+    int solved = 0;                            // set once sample_solution() succeeds
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+    // declared ahead of the first `goto err` so the cleanup always sees it initialised
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; // secret data
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk, param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp, param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp, param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o), param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+        compute_rhs(p, Y, t, y);
+        mayo_secure_clear(Y, sizeof(Y)); // secret-derived and not read again in this iteration
+        compute_A(p, Mtmp, A);
+        decode(V + param_k * param_v_bytes, r, param_k * param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            solved = 1;
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+    if (!solved) {
+        // every counter value failed: x holds no valid solution, so emit nothing
+        ret = MAYO_ERR;
+        goto err;
+    }
+
+    // s is not zero-initialised: mat_add and memcpy below are expected to
+    // write all param_n entries of each of the param_k rows
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(y, sizeof(y));
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));
+    mayo_secure_clear(tmp, DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+int mayo_sign(const mayo_params_t *p, unsigned char *sm, size_t *smlen,
+              const unsigned char *m, size_t mlen, const unsigned char *csk) {
+    // Produces sm = signature || message and *smlen = total length; returns MAYO_OK on success.
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    // Move the message into place BEFORE signing: the signature is written to
+    // the start of sm and would clobber m if the two buffers overlap.
+    memmove(sm + param_sig_bytes, m, mlen);
+    int ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes) {
+        return (ret != MAYO_OK) ? ret : MAYO_ERR; // never report a wrong length as success
+    }
+    *smlen = siglen + mlen;
+    return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    // sm = signature || message; the message is copied to m (length to *mlen) only if it verifies.
+    const int sig_bytes = PARAM_sig_bytes(p);
+    if (smlen < (size_t) sig_bytes) {
+        return MAYO_ERR;
+    }
+    const unsigned char *msg = sm + sig_bytes;
+    const size_t msg_len = smlen - sig_bytes;
+    const int result = mayo_verify(p, msg, msg_len, sm, pk);
+    if (result == MAYO_OK) {
+        *mlen = msg_len;
+        memmove(m, msg, msg_len);
+    }
+    return result;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    // Generates a key pair. csk receives a fresh random seed (the compact
+    // secret key); cpk receives seed_pk followed by Upper(P3).
+    // Returns MAYO_OK, or MAYO_ERR if randombytes() reports a failure.
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk || encoded O (secret)
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int m_legs = param_m / 32;
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // wipe everything that held, or may have been computed from, the secret O
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(S, sizeof(S));
+    mayo_secure_clear(P, sizeof(P)); // NOTE(review): the P1O_P2 half presumably ends up holding P1*O + P2
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1_len = PARAM_P1_bytes(p);
+    const int p2_len = PARAM_P2_bytes(p);
+    const int p3_len = PARAM_P3_bytes(p);
+    // expanded key = PRF(seed at the front of cpk) giving P1 || P2, then P3 copied as is
+    PK_PRF(pk, p1_len + p2_len, cpk, seed_len);
+    memmove(pk + (p1_len + p2_len), cpk + seed_len, p3_len);
+    return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    // Expands the compact secret key csk (a seed) into sk:
+    //   sk->p : P1 followed by L_i = (P1 + P1^t)*O + P2
+    //   sk->o : O in its packed (encoded) form
+    // Always returns MAYO_OK.
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int v = PARAM_v(p);
+    const int o = PARAM_o(p);
+    const int o_bytes = PARAM_O_bytes(p);
+    const int p1_bytes = PARAM_P1_bytes(p);
+    const int p2_bytes = PARAM_P2_bytes(p);
+    const int pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // S = shake256(seed_sk): its head is seed_pk, its tail the encoded O
+    shake256(S, pk_seed_bytes + o_bytes, csk, sk_seed_bytes);
+    unsigned char *seed_pk = S;
+    const unsigned char *O_enc = S + pk_seed_bytes;
+    decode(O_enc, O, v * o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, pk_seed_bytes);
+#endif
+
+    // P1 and P2 are taken from the PRF output as is (no encode/decode step)
+    uint64_t *P1 = sk->p;
+    uint64_t *L = P1 + (p1_bytes / 8); // L shares its storage with P2
+    PK_PRF((unsigned char *)P1, p1_bytes + p2_bytes, seed_pk, pk_seed_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int w = 0; w < (p1_bytes + p2_bytes) / 8; ++w) {
+        P1[w] = BSWAP64(P1[w]);
+    }
+#endif
+
+    // compute L_i = (P1 + P1^t)*O + P2
+    P1P1t_times_O(p, P1, O, L);
+
+    // write the packed O to sk
+    memcpy(sk->o, O_enc, o_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // undo the byte swap applied before the multiplication
+    for (int w = 0; w < (p1_bytes + p2_bytes) / 8; ++w) {
+        P1[w] = BSWAP64(P1[w]);
+    }
+#endif
+
+    // S and O held the secret O in packed and unpacked form
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+
+    return MAYO_OK;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    // Checks sig (packed s followed by the salt) against the mlen-byte
+    // message m under the compact public key cpk.
+    // Returns MAYO_OK if the value recomputed from s equals the target t
+    // derived from (m, salt), MAYO_ERR otherwise.
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    const int m_len = PARAM_m(p);
+    const int n_len = PARAM_n(p);
+    const int v_len = PARAM_v(p);
+    const int o_len = PARAM_o(p);
+    const int k_len = PARAM_k(p);
+    const int t_bytes = PARAM_m_bytes(p);
+    const int p1_bytes = PARAM_P1_bytes(p);
+    const int p2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int p3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int sig_bytes = PARAM_sig_bytes(p);
+    const int digest_bytes = PARAM_digest_bytes(p);
+    const int salt_bytes = PARAM_salt_bytes(p);
+
+    // the salt occupies the last salt_bytes bytes of the signature
+    const unsigned char *salt = sig + sig_bytes - salt_bytes;
+
+    // expand cpk into pk = P1 || P2 || P3
+    if (mayo_expand_pk(p, cpk, (unsigned char *)pk) != MAYO_OK) {
+        return MAYO_ERR;
+    }
+    uint64_t *P1 = pk;
+    uint64_t *P2 = P1 + (p1_bytes / 8);
+    uint64_t *P3 = P2 + (p2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int w = 0; w < p1_bytes / 8; ++w) {
+        P1[w] = BSWAP64(P1[w]);
+    }
+    for (int w = 0; w < p2_bytes / 8; ++w) {
+        P2[w] = BSWAP64(P2[w]);
+    }
+    for (int w = 0; w < p3_bytes / 8; ++w) {
+        P3[w] = BSWAP64(P3[w]);
+    }
+#endif
+
+    // t = decode(shake256(shake256(m) || salt))
+    shake256(tmp, digest_bytes, m, mlen);
+    memcpy(tmp + digest_bytes, salt, salt_bytes);
+    shake256(tEnc, t_bytes, tmp, digest_bytes + salt_bytes);
+    decode(tEnc, t, m_len);
+
+    // s = the k_len * n_len values packed at the front of sig
+    decode(sig, s, k_len * n_len);
+
+    // Compute S*P*S^T
+    m_calculate_PS_SPS(P1, P2, P3, s, m_len, v_len, o_len, k_len, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X); y is all zeros on entry
+    compute_rhs(p, SPS, y, y);
+
+    // good signature only if the first m_len entries agree
+    return (memcmp(y, t, m_len) == 0) ? MAYO_OK : MAYO_ERR;
+}
+
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes)
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // run-time parameters: each accessor reads the mayo_params_t that p points to
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // compile-time parameters: accessors expand to the MAYO_VARIANT constants and ignore p
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else // neither ENABLE_PARAMS_DYNAMIC nor MAYO_VARIANT was provided by the build
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters (one constant instance per parameter set when ENABLE_PARAMS_DYNAMIC is defined)
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q; // 16 for every parameter set defined in this header
+    const unsigned char *f_tail; // F_TAIL_LEN coefficients, see the F_TAIL_64/96/128 initializers
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): not set by MAYO_GEN_PARAMS and has no PARAM_ accessor - confirm it is still needed
+    int P1_bytes; // P1_bytes + P2_bytes + P3_bytes equals epk_bytes for all four parameter sets
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // parameter-set identifier string, e.g. "MAYO_1"
+} mayo_params_t;
+
+typedef struct sk_t { // output type of mayo_expand_sk()
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // P1_BYTES_MAX + P2_BYTES_MAX bytes, stored as 64-bit words
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX bytes
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ * 
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m in bytes
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/mem.h b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // byte-order reversal via compiler builtins
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // shift-and-mask fallback; the argument i is expanded several times, so it must be free of side effects
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// Constant-time a > b: b - a is formed in 64 bits (no overflow for int narrower than 64 bits); it is negative iff a > b.
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    uint64_t diff = (uint64_t) (((int64_t) b) - ((int64_t) a));
+    return (uint32_t) 0 - (uint32_t) (diff >> 63); // sign bit (0 or 1) stretched to a full mask, unsigned shifts only
+}
+
+// Constant-time a > b: b - a is formed in 64 bits and is negative iff a > b.
+// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    uint64_t diff = (uint64_t) (((int64_t) b) - ((int64_t) a));
+    return (uint64_t) 0 - (diff >> 63); // sign bit (0 or 1) stretched to a full mask, unsigned shifts only
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF (constant time; unsigned arithmetic, so also correct when a ^ b is negative)
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t) 0 - (uint32_t) (((uint64_t) 0 - (uint32_t) (a ^ b)) >> 63);
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (constant time; also correct when a ^ b is negative)
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t) 0 - (((uint64_t) 0 - (uint32_t) (a ^ b)) >> 63);
+}
+
+// Constant-time byte comparison: yields 0x00 when a == b and 0xFF otherwise.
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) (((uint32_t) 0 - (uint32_t) (a ^ b)) >> 24);
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding to OQS_MEM_secure_free().
+ * 
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding to OQS_MEM_cleanse(); the memory is not freed.
+ * 
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/params.c b/src/sig/mayo/pqmayo_mayo-1_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) /* defines const mayo_params_t nm from the nm##_<field> macros; R_bytes is not listed and stays 0; the caller writes the final ';' */ \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-1_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// Product of two GF(16) elements modulo x^4 + x + 1; only the low nibble of a takes part.
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // Carry-less schoolbook product: one partial product per bit of the low nibble of a.
+    unsigned int prod = 0;
+    for (unsigned int bit = 1; bit <= 8; bit <<= 1) {
+        prod ^= (a & bit) * b;
+    }
+    prod &= 0xff; // keep 8 bits, as if accumulated in an unsigned char
+
+    // Reduction: x^4 = x + 1, so every bit 4+i is folded onto bits i and i+1.
+    const unsigned int hi = prod & 0xf0;
+    const unsigned int folded = prod ^ (hi >> 4) ^ (hi >> 3);
+    return (unsigned char) (folded & 0x0f);
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // Byte-sliced GF(16) multiply: with one element in the low nibble of each byte of b,
+    // every byte of the result is that element times the low nibble of a (mod x^4 + x + 1).
+    uint64_t prod = 0;
+    for (uint64_t bit = 1; bit <= 8; bit <<= 1) {
+        prod ^= (a & bit) * b; // carry-less partial product
+    }
+
+    // x^4 = x + 1: in every byte, bit 4+i is folded onto bits i and i+1.
+    const uint64_t hi = prod & 0xf0f0f0f0f0f0f0f0;
+    const uint64_t folded = prod ^ (hi >> 4) ^ (hi >> 3);
+    return folded & 0x0f0f0f0f0f0f0f0f;
+}
+
+// GF(16) addition: bitwise XOR of the two operands.
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return (unsigned char) (a ^ b);
+}
+
+// GF(16) subtraction: identical to addition, i.e. bitwise XOR.
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return (unsigned char) (a ^ b);
+}
+
+// GF(16) negation: every element is its own additive inverse, so a is returned unchanged.
+static inline unsigned char neg_f(unsigned char a) {
+    return (unsigned char) a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+    // Inversion computed as a^14 from the squaring chain a^2, a^4, a^8 (0 maps to 0).
+    // Table form of the same map: {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, 10, 4, 3, 8}[a & 15]
+
+    const unsigned char sq = mul_f(a, a);
+    const unsigned char quad = mul_f(sq, sq);
+    const unsigned char oct = mul_f(quad, quad);
+    const unsigned char sixth = mul_f(sq, quad);
+
+    // a^8 * a^6 = a^14
+    return mul_f(oct, sixth);
+}
+
+// GF(16) dot product of a[0..n-1] with the strided sequence b[0], b[m], b[2m], ...
+static inline unsigned char lincomb(const unsigned char *a, const unsigned char *b, int n, int m) {
+    unsigned char acc = 0;
+    for (int idx = 0; idx < n; idx++) {
+        acc = add_f(mul_f(a[idx], b[idx * m]), acc);
+    }
+    return acc;
+}
+
+// Row-major GF(16) matrix product: c (row_a x col_b) = a (row_a x colrow_ab) * b (colrow_ab x col_b).
+static inline void mat_mul(const unsigned char *a, const unsigned char *b, unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; r++) {
+        for (int col = 0; col < col_b; col++) {
+            c[r * col_b + col] = lincomb(a + r * colrow_ab, b + col, colrow_ab, col_b);
+        }
+    }
+}
+
+// Element-wise GF(16) sum of two row-major m x n matrices: c = a + b.
+static inline void mat_add(const unsigned char *a, const unsigned char *b, unsigned char *c, int m, int n) {
+    for (int r = 0; r < m; r++) {
+        for (int col = 0; col < n; col++) {
+            c[r * n + col] = add_f(a[r * n + col], b[r * n + col]);
+        }
+    }
+}
+
+// Copies 2 * m_legs 64-bit words from in to out.
+static inline void m_vec_copy(int m_legs, const uint64_t *in, uint64_t *out) {
+    const int n_words = m_legs * 2;
+    for (int w = 0; w < n_words; ++w) {
+        out[w] = in[w];
+    }
+}
+
+// XORs 2 * m_legs 64-bit words of in into acc (addition of nibble-packed GF(16) vectors).
+static inline void m_vec_add(int m_legs, const uint64_t *in, uint64_t *acc) {
+    const int n_words = m_legs * 2;
+    for (int w = 0; w < n_words; ++w) {
+        acc[w] = acc[w] ^ in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    // Multiplies each of the eight nibbles of a by the GF(16) element held in
+    // the low 4 bits of b (higher bits of b are ignored).
+    // Shift-and-add: a is multiplied by x three times, and each intermediate
+    // value is added to the result when the matching bit of b is set.
+    const uint32_t msb_mask = 0x88888888; // top bit (x^3) of every nibble
+    const uint32_t scalar = b;
+    uint32_t cur = a;
+    uint32_t acc = cur * (scalar & 1);
+
+    for (int bit = 1; bit <= 3; ++bit) {
+        // cur *= x in every nibble: remove the x^3 terms, shift up by one,
+        // and add x + 1 (= 3) wherever x^3 was present, since x^4 = x + 1.
+        const uint32_t overflow = cur & msb_mask;
+        cur ^= overflow;
+        cur = (cur << 1) ^ ((overflow >> 3) * 3);
+
+        // accumulate when this bit of the scalar is set
+        acc ^= cur * ((scalar >> bit) & 1);
+    }
+
+    return acc;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) { // one pass over the 2 * m_legs 64-bit words
+        acc[w] = acc[w] ^ gf16v_mul_u64(in[w], a); // gf16v_mul_u64 is not defined in this header
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) { // one pass over the 2 * m_legs 64-bit words
+        acc[w] = acc[w] ^ gf16v_mul_u64(in[w], 0x2); // scalar 0x2 is the field element x
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-2_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-2_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-2_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output); // declared only; the definition lives outside this header
+#define AES_ECB_encrypt AES_256_ECB // AES_ECB_encrypt is an alias for AES_256_ECB
+
+#ifdef ENABLE_AESNI // AES-NI build: AES_128_CTR resolves to the _NI routine declared below
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen); // NOTE(review): presumably a reduced-round variant — confirm at the definition
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                              const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 }; // the fallback path always uses an all-zero 12-byte IV
+    (void) inputByteLen;         // unused here; kept so both AES_128_CTR variants share one signature
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);
+    return (int) outputByteLen;  // reports the requested output length, converted to int
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.c b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c
new file mode 100644
index 0000000000..a7cf85eedf
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_2 // runtime-parameter build: pass the MAYO_2 parameter set by address
+#else
+#define MAYO_PARAMS 0 // otherwise a null parameter pointer is handed to the mayo_* calls below
+#endif
+
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    // Thin wrapper: forwards pk and sk to mayo_keypair with this variant's parameter selector.
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+int crypto_sign(unsigned char *sm, size_t *smlen,
+                const unsigned char *m, size_t mlen,
+                const unsigned char *sk) {
+    // Thin wrapper: hands every argument to mayo_sign unchanged and returns its status.
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+int crypto_sign_signature(unsigned char *sig, size_t *siglen,
+                          const unsigned char *m, size_t mlen,
+                          const unsigned char *sk) {
+    // Thin wrapper: hands every argument to mayo_sign_signature and returns its status.
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+int crypto_sign_open(unsigned char *m, size_t *mlen,
+                     const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk) {
+    // Thin wrapper: hands every argument to mayo_open unchanged and returns its status.
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                       const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    // A signature must be exactly CRYPTO_BYTES long; any other length is
+    // rejected with -1 before mayo_verify is consulted.
+    const int length_ok = (siglen == CRYPTO_BYTES);
+    return length_ok ? mayo_verify(MAYO_PARAMS, m, mlen, sig, pk) : -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/api.h b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h
new file mode 100644
index 0000000000..265a5639db
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 24 // presumably the secret-key length in bytes — confirm against mayo.h
+#define CRYPTO_PUBLICKEYBYTES 5488 // presumably the public-key length in bytes — confirm against mayo.h
+#define CRYPTO_BYTES 180 // the only signature length crypto_sign_verify (api.c) accepts
+
+#define CRYPTO_ALGNAME "MAYO-2"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair) // symbol is renamed through MAYO_NAMESPACE
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk); // forwards to mayo_keypair (see api.c)
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign) // symbol is renamed through MAYO_NAMESPACE
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk); // forwards to mayo_sign (see api.c)
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature) // symbol is renamed through MAYO_NAMESPACE
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk); // forwards to mayo_sign_signature (see api.c)
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open) // symbol is renamed through MAYO_NAMESPACE
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk); // forwards to mayo_open (see api.c)
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify) // symbol is renamed through MAYO_NAMESPACE
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk); // returns -1 unless siglen == CRYPTO_BYTES, else the mayo_verify result
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { // packs a size x size matrix of m-vectors into its upper triangle
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size); // fixed-variant AVX build with M_MAX == 64 delegates to the specialised routine
+#else
+    int m_vecs_stored = 0; // index of the next output slot; every slot is m_legs * 2 words wide
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++) { // walk the upper triangle row by row, diagonal included
+            m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), out + m_legs * 2 * m_vecs_stored ); // slot <- entry (r, c)
+            if (r != c) {
+                m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), out + m_legs * 2 * m_vecs_stored ); // off-diagonal: fold in the mirrored entry (c, r)
+            }
+            m_vecs_stored ++;
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ // accumulates (P1 + P1^T) * O into acc; P1 is read as a packed upper triangle
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;   
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;   
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32; // only needed by the generic m_vec_mul_add calls below
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);;
+
+    int // running index into the packed upper-triangular entries of P1
+    bs_mat_entries_used = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) { // diagonal entries are skipped: only the index advances
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                // each entry (r, c) contributes twice: O row c into acc row r, and O row r into acc row c
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { // fills M from V and L, and Y from V, P1 and V again (M = V*L, Y = V*P1*V^T per arithmetic.h)
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // multiplication tables for V, filled by mayo_V_multabs_avx2
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch passed from the P1 step to the Y step
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // same sequence as the M_MAX == 64 branch, using the mayo_3_* kernels
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // same sequence as the M_MAX == 64 branch, using the mayo_5_* kernels
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p; // repeats the cast at the top of the function; harmless
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // zero-initialised scratch for the intermediate product
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+        param_k, param_n - param_o, param_o); // M step: V (k rows, n-o cols) with L (o cols)
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1); // Pv step: triangular P1 with V transposed
+
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+        Y, param_k, param_n - param_o,
+        param_k); // Y step: V with Pv
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { // two steps: update P1O_P2 from P1 and O, then derive P3 from O and P1O_P2 (P3 = O^T * (P1*O + P2) per arithmetic.h)
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX]; // multiplication tables for O, filled by mayo_O_multabs_avx2
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX]; // same sequence as the M_MAX == 64 branch, using the mayo_3_* kernels
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX]; // same sequence as the M_MAX == 64 branch, using the mayo_5_* kernels
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p; // repeats the cast at the top of the function; harmless
+    #endif
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32; // same value as param_m/32 used on the next line
+    mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); // first step: triangular P1 with O, result goes into P1O_P2
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); // second step: O transposed with P1O_P2, result goes into P3
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m; // m is only consumed by the generic (non-AVX) branch at the bottom
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++) // split every row of S: the first v entries go to S1, the remaining o entries to S2
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; // zero-initialised scratch; the second part starts at offset V_MAX*K_MAX*M_MAX/16
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    // (a generic m_calculate_SPS call was used here before the two kernels below)
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED // not valid C: an AVX build with any other M_MAX fails to compile here
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 }; // zero-initialised scratch handed from the PS step to the SPS step
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// Sample a solution x to Ax = y, with r used as randomness.
+// Requires:
+// - A is a row-major matrix with m rows and k*o+1 columns (A_cols must equal k*o+1: both
+//   strides are used below); its last column is overwritten, first with y - A*r
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// Returns: 1 on success, 0 on failure (the reduced system is not full rank)
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar, with the last column of A cleared so that it does not contribute
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1); // reduce the augmented matrix to echelon form
+
+    // full rank iff the last row of A (excluding its final entry, which belongs to y) is nonzero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o - 1);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+        // the bound is capped at k*o - 1, the last coefficient column, so x[col] stays inside its k*o bytes
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // ct_compare_8 compares two chars in constant time: 0x00 if they are equal, 0xff otherwise.
+            // correct_column is therefore 0xff only for the first nonzero entry of this row.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+            // rows are handled in groups of 8, so A needs storage for rows up to the next multiple of 8
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN // set when the compiler reports a big-endian target
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64) // NOTE(review): arithmetic.c tests the value (#if MAYO_AVX) while this header tests defined(MAYO_AVX); they differ if MAYO_AVX is defined as 0 — confirm
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) // without MAYO_VARIANT all three scalar headers are included
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h> // NOTE(review): the 96-element header is also pulled in for M_MAX == 64 — confirm this is intentional
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h> // NOTE(review): the 128-element header is also pulled in for M_MAX == 96 — confirm this is intentional
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen; P1O_P2 is both read and written
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen: pack a size x size matrix of m-vectors into its upper triangle
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk (the second parameter is named P1 in the definition in arithmetic.c)
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign; returns 1 on success and 0 on failure
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify; P is passed as its three blocks P1, P2 and P3
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 128 field elements from `in` to `out`.
+// The vector occupies eight consecutive 64-bit limbs, which are copied in
+// ascending order, one limb per iteration.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    const uint64_t *src = in;
+    uint64_t *dst = out;
+    const uint64_t *const end = in + 8;
+    while (src != end) {
+        *dst++ = *src++;
+    }
+}
+
+
+// XOR one vector of 128 field elements into the accumulator: acc ^= in.
+// Both operands span eight consecutive 64-bit limbs, which are processed in
+// ascending order, one limb per iteration.
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *src = in;
+    uint64_t *dst = acc;
+    const uint64_t *const end = in + 8;
+    while (src != end) {
+        *dst++ ^= *src++;
+    }
+}
+
+// For each of the eight limbs: acc ^= gf16v_mul_u64(in, 0x2), i.e. the input scaled by the constant 0x2 (x).
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// For each of the eight limbs: acc ^= gf16v_mul_u64(in, 0x9); 0x9 is x^3+1, the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// For each of the eight limbs: acc ^= gf16v_mul_u64(in, a), i.e. the input scaled by the caller's scalar a.
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static 
+    inline void multiply_bins_128(uint64_t *bins, uint64_t *out) { // folds bins 2..15 into bin 1 (8 limbs per bin), then copies bin 1 to out
+    // Order matters: each step feeds a bin that a later step reads. bins is modified in place; bin 0 is never read.
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins +  10 * 8); // bin10 += x^-1 * bin5
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); // bin12 += x * bin11
+    m_vec_mul_add_x_inv_128(bins +  10 * 8, bins +  7 * 8); // bin7 += x^-1 * bin10
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8); // bin6 += x * bin12
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins +  14 * 8); // bin14 += x^-1 * bin7
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8); // bin3 += x * bin6
+    m_vec_mul_add_x_inv_128(bins +  14 * 8, bins +  15 * 8); // bin15 += x^-1 * bin14
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8); // bin8 += x * bin3
+    m_vec_mul_add_x_inv_128(bins +  15 * 8, bins +  13 * 8); // bin13 += x^-1 * bin15
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8); // bin4 += x * bin8
+    m_vec_mul_add_x_inv_128(bins +  13 * 8, bins +  9 * 8); // bin9 += x^-1 * bin13
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8); // bin2 += x * bin4
+    m_vec_mul_add_x_inv_128(bins +   9 * 8, bins +  1 * 8); // bin1 += x^-1 * bin9
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8); // bin1 += x * bin2
+    vec_copy_128(bins + 8, out); // bins + 8 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 64 field elements (four 64-bit limbs) from `in` to
+// `out`, lowest limb first.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 4; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// XOR one vector of 64 field elements (four 64-bit limbs) into the
+// accumulator: acc ^= in, lowest limb first.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] = acc[limb] ^ in[limb];
+    }
+}
+
+// For each of the four limbs: acc ^= gf16v_mul_u64(in, a), i.e. the input scaled by the caller's scalar a.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// For each of the four limbs: acc ^= gf16v_mul_u64(in, 0x2), i.e. the input scaled by the constant 0x2 (x).
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// For each of the four limbs: acc ^= gf16v_mul_u64(in, 0x9); 0x9 is x^3+1, the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+static 
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) { // folds bins 2..15 into bin 1 (4 limbs per bin), then copies bin 1 to out
+    // Order matters: each step feeds a bin that a later step reads. bins is modified in place; bin 0 is never read.
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins +  10 * 4); // bin10 += x^-1 * bin5
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); // bin12 += x * bin11
+    m_vec_mul_add_x_inv_64(bins +  10 * 4, bins +  7 * 4); // bin7 += x^-1 * bin10
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4); // bin6 += x * bin12
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins +  14 * 4); // bin14 += x^-1 * bin7
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4); // bin3 += x * bin6
+    m_vec_mul_add_x_inv_64(bins +  14 * 4, bins +  15 * 4); // bin15 += x^-1 * bin14
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4); // bin8 += x * bin3
+    m_vec_mul_add_x_inv_64(bins +  15 * 4, bins +  13 * 4); // bin13 += x^-1 * bin15
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4); // bin4 += x * bin8
+    m_vec_mul_add_x_inv_64(bins +  13 * 4, bins +  9 * 4); // bin9 += x^-1 * bin13
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4); // bin2 += x * bin4
+    m_vec_mul_add_x_inv_64(bins +   9 * 4, bins +  1 * 4); // bin1 += x^-1 * bin9
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4); // bin1 += x * bin2
+    vec_copy_64(bins + 4, out); // bins + 4 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 96 field elements (six 64-bit limbs) from `in` to `out`, lowest limb first.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    const uint64_t *src = in;
+    uint64_t *dst = out;
+    const uint64_t *const end = in + 6;
+    while (src != end) {
+        *dst++ = *src++;
+    }
+}
+
+// XOR one vector of 96 field elements (six 64-bit limbs) into the accumulator: acc ^= in.
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *src = in;
+    uint64_t *dst = acc;
+    const uint64_t *const end = in + 6;
+    while (src != end) {
+        *dst++ ^= *src++;
+    }
+}
+
+// For each of the six limbs: acc ^= gf16v_mul_u64(in, 0x2), i.e. the input scaled by the constant 0x2 (x).
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+
+// For each of the six limbs: acc ^= gf16v_mul_u64(in, 0x9); 0x9 is x^3+1, the inverse of x modulo x^4+x+1.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// For each of the six limbs: acc ^= gf16v_mul_u64(in, a), i.e. the input scaled by the caller's scalar a.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static 
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) { // folds bins 2..15 into bin 1 (6 limbs per bin), then copies bin 1 to out
+    // Order matters: each step feeds a bin that a later step reads. bins is modified in place; bin 0 is never read.
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins +  10 * 6); // bin10 += x^-1 * bin5
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); // bin12 += x * bin11
+    m_vec_mul_add_x_inv_96(bins +  10 * 6, bins +  7 * 6); // bin7 += x^-1 * bin10
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6); // bin6 += x * bin12
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins +  14 * 6); // bin14 += x^-1 * bin7
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6); // bin3 += x * bin6
+    m_vec_mul_add_x_inv_96(bins +  14 * 6, bins +  15 * 6); // bin15 += x^-1 * bin14
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6); // bin8 += x * bin3
+    m_vec_mul_add_x_inv_96(bins +  15 * 6, bins +  13 * 6); // bin13 += x^-1 * bin15
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6); // bin4 += x * bin8
+    m_vec_mul_add_x_inv_96(bins +  13 * 6, bins +  9 * 6); // bin9 += x^-1 * bin13
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6); // bin2 += x * bin4
+    m_vec_mul_add_x_inv_96(bins +   9 * 6, bins +  1 * 6); // bin1 += x^-1 * bin9
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6); // bin1 += x * bin2
+    vec_copy_96(bins + 6, out); // bins + 6 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#define K_OVER_2 ((K_MAX+1)/2) // K_MAX / 2 rounded up; per-column table stride in mayo_V_multabs_avx2
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { // four 32-byte rows; each row is a 16-entry table stored twice
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, // i * 0x1
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, // i * 0x2 (x)
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, // i * 0x4 (x^2)
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01 // i * 0x8 (x^3)
+};
+
+//
+// Build the GF(16) multiplication table for the 4-bit value 'b' (technique taken from the OV paper).
+// In each 128-bit lane of the result, byte i holds i*b for i = 0..15; bits 4..7 of 'b' are ignored.
+//
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+
+    // b broadcast to every 16-bit lane, plus b >> 1 so that bits 1 and 3 of b
+    // can be tested with the same two single-bit masks as bits 0 and 2.
+    const __m256i b_even = _mm256_set1_epi16( b & 0xf );
+    const __m256i b_odd  = _mm256_srli_epi16( b_even, 1 );
+
+    // Bit j of b switches on row j of __gf16_mulbase: the compare yields an
+    // all-ones lane when the bit is set and an all-zero lane otherwise.
+    __m256i table = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)) & _mm256_cmpgt_epi16( b_even & bit0, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)) & _mm256_cmpgt_epi16( b_odd  & bit0, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)) & _mm256_cmpgt_epi16( b_even & bit2, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)) & _mm256_cmpgt_epi16( b_odd  & bit2, zero);
+    return table;
+}
+
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { // XOR of two byte shuffles: tab_h indexed by (v >> 4) & mask_f, tab_l by v & mask_f
+    return _mm256_shuffle_epi8(tab_h, mask_f & _mm256_srli_epi16(v, 4)) ^ _mm256_shuffle_epi8(tab_l, mask_f & v);
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+    // One table serves both nibbles of every byte of a: as built it maps low nibbles,
+    // and shifted left by 4 it places the looked-up value in the high nibble.
+    const __m256i low_tab = tbl32_gf16_multab2( b );
+    return linear_transform_8x8_256b( low_tab, _mm256_slli_epi16( low_tab, 4 ), a, _mm256_set1_epi8(0xf) );
+}
+
+static inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    // One table per pair of adjacent entries in a row of O (V_MAX rows, O_MAX columns):
+    // the even entry's table sits in the low nibbles, the odd entry's in the high nibbles.
+    for (size_t row = 0; row < V_MAX; row++) {
+        const unsigned char *o_row = O + O_MAX*row;
+        __m256i *tab_row = O_multabs + O_MAX/2*row;
+        for (size_t col = 0; col < O_MAX; col += 2) {
+            tab_row[col/2] = tbl32_gf16_multab2(o_row[col]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(o_row[col + 1]), 4);
+        }
+    }
+}
+
+
+static inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    // Column c of V gets K_OVER_2 tables: table j packs row 2j (low nibbles) and row 2j+1 (high nibbles).
+    for (size_t col = 0; col < V_MAX; col++)
+    {
+        __m256i *tabs = V_multabs + K_OVER_2*col;
+        size_t row = 0;
+        while (row+1 < K_MAX) {
+            tabs[row/2] = tbl32_gf16_multab2(V[V_MAX*row + col]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(row+1) + col]), 4);
+            row += 2;
+        }
+#if K_MAX % 2 == 1
+        tabs[row/2] = tbl32_gf16_multab2(V[V_MAX*row + col]); // odd K_MAX: the last row has no partner
+#endif
+    }
+}
+
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = { // 16 rows of 32 bytes: row b holds the GF(16) products b*x for x = 0..15, stored twice (once per 128-bit half)
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, 
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, 
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, 
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, 
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, 
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, 
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, 
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, 
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, 
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, 
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, 
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, 
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, 
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, 
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+static
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+    size_t row;
+    for (size_t col = 0; col < V_MAX; col++) {
+        __m256i *tabs = S1_multabs + K_OVER_2*col;
+        for (row = 0; row+1 < K_MAX; row += 2) { // rows are consumed in pairs: table(row) ^ (table(row+1) << 4)
+            const __m256i lo = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+            const __m256i hi = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(row+1) + col]));
+            tabs[row/2] = lo ^ _mm256_slli_epi16(hi, 4);
+        }
+#if K_MAX % 2 == 1
+        tabs[row/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+#endif
+    }
+}
+
+static
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    size_t row;
+    for (size_t col = 0; col < O_MAX; col++) {
+        __m256i *tabs = S2_multabs + K_OVER_2*col;
+        // rows are consumed in pairs: table(row) ^ (table(row+1) << 4), each table read from mayo_gf16_mul
+        for (row = 0; row+1 < K_MAX; row += 2) {
+            const __m256i lo = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+            const __m256i hi = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(row+1) + col]));
+            tabs[row/2] = lo ^ _mm256_slli_epi16(hi, 4);
+        }
+#if K_MAX % 2 == 1
+        tabs[row/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+#endif
+    }
+}
+
+// Multiply each of the 16 nibbles packed in `a` by `b`, nibble-wise, and return
+// the 16 products packed the same way. Only the low 4 bits of `b` are used.
+//
+// Works by shift-and-add: the result accumulates a * x^i for every set bit i
+// of b, where each multiplication by x shifts all nibbles left by one bit and
+// folds the bit that overflows a nibble back in as 0b0011 (i.e. x^4 = x + 1).
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // bit 3 of every nibble
+    const uint64_t b_bits = b;
+
+    uint64_t x_power = a;                 // a * x^i for the current i
+    uint64_t product = x_power * (b_bits & 1);
+
+    for (unsigned i = 1; i < 4; i++) {
+        // multiply every nibble by x, reducing the overflowing bit
+        const uint64_t carry = x_power & top_bits;
+        x_power ^= carry;
+        x_power = (x_power << 1) ^ ((carry >> 3) * 3);
+
+        // add the term in when bit i of b is set (0/1 multiply, no branch)
+        product ^= x_power * ((b_bits >> i) & 1);
+    }
+    return product;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <immintrin.h>
+#include <stdint.h>
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// generate multiplication table for '4-bit' variable 'b'. From https://eprint.iacr.org/2023/059/.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+
+    // b_even keeps bits 0 and 2 of b in place; shifting right by one brings
+    // bits 1 and 3 to the same positions so both can be tested with bit0/bit2
+    const __m256i b_even = _mm256_set1_epi16( b & 0xf );
+    const __m256i b_odd  = _mm256_srli_epi16( b_even, 1 );
+
+    // each compare yields an all-ones lane mask when bit i of b is set; the
+    // result is the XOR of the i-th 32-byte block of __gf16_mulbase over those bits
+    __m256i table = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)) & _mm256_cmpgt_epi16( b_even & bit0, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)) & _mm256_cmpgt_epi16( b_odd  & bit0, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)) & _mm256_cmpgt_epi16( b_even & bit2, zero);
+    table ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)) & _mm256_cmpgt_epi16( b_odd  & bit2, zero);
+    return table;
+}
+
+/* put matrix in row echelon form with ones on first nonzero entries in constant time*/
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+    // Dimensions are fixed at compile time by the macros below; the runtime arguments are ignored.
+    (void) _nrows;
+    (void) _ncols;
+    // NOTE(review): these function-local #defines are never #undef'd, so they remain visible after EF.
+    #define nrows M_MAX
+    #define ncols (K_MAX * O_MAX + 1)
+
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW];
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A in the tail of AVX2 registers: row i fills the last ncols bytes of its MAX_COLS-byte slot (any leading padding bytes are not initialised)
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++)
+        {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+        }
+    }
+    // The same loop body is textually included five times below, once per range of pivot columns.
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0); // first non-padding column
+    for (; pivot_col < MAX_COLS-128; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back (dropping the per-row padding again)
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+        }
+    }
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+int pivot_col_rounded = pivot_col/32; // index of the 32-byte register containing pivot_col; earlier registers are skipped below
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1; // all-ones until a nonzero pivot has been accumulated
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row); // NOTE(review): assumes ct_compare_32 yields 0 on equality and all-ones otherwise — confirm
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) ); // add this row if it is the pivot row, or lies below it while the pivot is still zero
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    pivot = pivot_row_bytes[pivot_col];
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+    _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]); // table lookup: every byte of the row is replaced by its product with `inverse`
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    unsigned char below_pivot =  (unsigned char) (ct_is_greater_than(row, pivot_row));
+    unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+    __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim); // the multiplier is forced to 0 for rows that are not below the pivot row
+    if (row <= pivot_row_upper_bound) {
+        __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero);
+        for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { 
+            A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+                                                    _mm256_shuffle_epi8(multab, _pivot_row[col]);
+        }
+    } else {
+        for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+            A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+        }
+    }
+}
+
+pivot_row += (-(int32_t)(~pivot_is_zero)); // +1 if a nonzero pivot was found, +0 otherwise (assuming pivot_is_zero is 0 or all-ones)
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    // each input byte holds two 4-bit elements, low nibble first
+    const int full_bytes = mdeclen / 2;
+    for (int b = 0; b < full_bytes; ++b) {
+        mdec[2*b]     = m[b] & 0xf;
+        mdec[2*b + 1] = m[b] >> 4;
+    }
+    if (mdeclen % 2 == 1) { // odd count: one more element from the low nibble of the next byte
+        mdec[2*full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    // pack two elements per byte: even index -> low nibble, odd index -> high nibble
+    const int pairs = mlen / 2;
+    for (int b = 0; b < pairs; ++b) {
+        menc[b] = m[2*b] | (m[2*b + 1] << 4);
+    }
+    if (mlen % 2 == 1) { // odd count: the last element is stored alone
+        menc[pairs] = m[2*pairs];
+    }
+}
+
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // Folds the k*(k+1)/2 entries of vPv into temp (multiply by X, reduce mod f(X), add the entry), then sets y = t ^ temp nibble by nibble.
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0};
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    int k = 0; // word index into temp; after the first pass it enters each iteration as m/16 - 1 (the top word)
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // nibble shifted out of word k
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else {
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add: entry (i, j), plus entry (j, i) when i != j
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--;
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+// Transposes, in place, a 16x16 matrix of 4-bit entries held in 16 words:
+// entry (r, c) is nibble c of M[r], nibble 0 being the least significant.
+// M must point to 16 uint64_t words.
+//
+// The work is done in four butterfly stages (stage = 0..3). Stage s pairs
+// row i with row i + 2^s (for every i whose bit s is clear) and exchanges
+// the upper 2^s-nibble half of each 2^(s+1)-nibble group of M[i] with the
+// lower half of the same group of M[i + 2^s]. After the four stages every
+// entry has moved from (r, c) to (c, r).
+//
+// Each exchange is the masked XOR swap
+//     t = ((M[i] >> shift) ^ M[i + stride]) & mask;
+// so there are no data-dependent branches or memory accesses.
+static void transpose_16x16_nibbles(uint64_t *M){
+    // low_half_mask[s] selects the lower half of every 2^(s+1)-nibble group
+    static const uint64_t low_half_mask[4] = {
+        0x0f0f0f0f0f0f0f0f, // stage 0: single nibbles
+        0x00ff00ff00ff00ff, // stage 1: bytes
+        0x0000ffff0000ffff, // stage 2: 16-bit groups
+        0x00000000ffffffff  // stage 3: 32-bit groups
+    };
+
+    for (unsigned stage = 0; stage < 4; stage++)
+    {
+        const size_t   stride = (size_t) 1 << stage; // distance between paired rows
+        const unsigned shift  = 4u << stage;         // width of one half, in bits
+        const uint64_t mask   = low_half_mask[stage];
+
+        for (size_t i = 0; i < 16; i++)
+        {
+            // visit each pair once, from the row whose bit `stage` is clear
+            if (i & stride) {
+                continue;
+            }
+
+            uint64_t t = ((M[i] >> shift) ^ M[i + stride]) & mask;
+            M[i]          ^= t << shift;
+            M[i + stride] ^= t;
+        }
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // Steps: accumulate shifted copies of the VtL blocks into A, transpose 16x16 nibble tiles, fold the rows >= m back mod f(X), then unpack one element per byte into A_out.
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4; // the next (i, j) pair is shifted by one more nibble
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    unsigned char tab[F_TAIL_LEN*4] = {0}; // tab[4*i + b] = mul_f(f_tail[i], 1 << b)
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // t0..t3: bit 0..3 of every nibble of A[pos], each moved to the nibble's low bit
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+// Generate a MAYO key pair.
+//
+// p  - parameter set
+// pk - output buffer, passed on as the compact public key
+// sk - output buffer, passed on as the compact secret key
+//
+// Thin wrapper: forwards to mayo_keypair_compact() and returns its status
+// code unchanged.
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+// Signs m (mlen bytes) under the compact secret key csk: writes the signature to sig and its length
+// to *siglen. Returns MAYO_OK on success, the mayo_expand_sk() error, or MAYO_ERR if randombytes fails.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r, param_k * param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+
+    // NOTE(review): if all 256 attempts fail, the loop falls through and a signature is still emitted with MAYO_OK.
+    // assemble s: for each i, s_i = (v_i + O * x_i) || x_i
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(y, M_MAX);             // annotated as secret above, so wipe it with the rest
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));   // V * L, computed from the secret v_i and L
+    mayo_secure_clear(tmp, DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+// Writes signature || m to sm. sm and m may overlap (e.g. in-place signing with sm == m).
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    // Move the message to its final place first and sign that copy: writing the
+    // signature into sm before the move would corrupt an overlapping m.
+    memmove(sm + param_sig_bytes, m, mlen);
+    int ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret == MAYO_OK && siglen == (size_t) param_sig_bytes) {
+        *smlen = siglen + mlen;
+    }
+    return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    if (smlen < (size_t)param_sig_bytes) { // sm = signature || message, so it cannot be shorter than a signature
+        return MAYO_ERR;
+    }
+    const unsigned char *msg = sm + param_sig_bytes;
+    const size_t msg_len = smlen - param_sig_bytes;
+    const int result = mayo_verify(p, msg, msg_len, sm, pk);
+    if (result != MAYO_OK) { // m and *mlen are left untouched when verification fails
+        return result;
+    }
+    *mlen = msg_len;
+    memmove(m, msg, msg_len);
+    return result;
+}
+
+// Generates a compact key pair: csk receives a fresh random secret seed, cpk receives seed_pk || Upper(P3).
+// Returns MAYO_OK, or MAYO_ERR if randombytes fails. Secret-dependent scratch buffers are wiped on exit.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX); // S holds the encoded secret O
+    mayo_secure_clear(P, sizeof(P)); // presumably its second part now holds P1*O + P2 (per the P1O_P2 naming), which depends on O
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1_p2_len = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+
+    // cpk starts with the seed: expand it into the first p1_p2_len bytes of pk ...
+    PK_PRF(pk, p1_p2_len, cpk, seed_len);
+    // ... then copy the P3 bytes stored after the seed right behind them
+    memmove(pk + p1_p2_len, cpk + seed_len, PARAM_P3_bytes(p));
+    return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK; // never modified below: this function always returns MAYO_OK
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    // S = shake256(seed_sk): the public seed followed by the encoded O
+    const unsigned char *seed_sk = csk;
+    unsigned char *seed_pk = S;
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]); // swap every word before the arithmetic below
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2; // L aliases P2 inside sk->p; presumably the result replaces P2 in place
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]); // swap back, undoing the swap above
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk); // pk = P1 || P2 || P3
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t = shake256(digest || salt), where the salt is the last salt_bytes bytes of the signature
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    compute_rhs(p, SPS, y, y); // y is all-zero and passed as both t and the output, so it ends up holding the reduced value alone
+    
+    if (memcmp(y, t, param_m) == 0) { // the signature is valid iff the recomputed value equals the hashed target t
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5 // each F_TAIL_* initializer below lists this many coefficients
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o) // 58
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) // 73; MAYO_1_k is defined below, which is fine because macros expand at the point of use
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32 // = m / 2
+#define MAYO_1_O_bytes 232 // = v * o / 2
+#define MAYO_1_v_bytes 29 // = ceil(v / 2)
+#define MAYO_1_r_bytes 36 // = ceil(k * o / 2)
+#define MAYO_1_P1_bytes 54752 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_1_P2_bytes 14848 // = v * o * m_bytes
+#define MAYO_1_P3_bytes 1152 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_1_csk_bytes 24 // same value as sk_seed_bytes
+#define MAYO_1_esk_bytes 69856 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_1_cpk_bytes 1168 // = pk_seed_bytes + P3_bytes
+#define MAYO_1_epk_bytes 70752 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_1_sig_bytes 321 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_1_f_tail F_TAIL_64 // brace initializer, used when MAYO_VARIANT is fixed at compile time
+#define MAYO_1_f_tail_arr f_tail_64 // array name, used by params.c in the dynamic build
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o) // 60
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) // 73; MAYO_2_k is defined below, which is fine because macros expand at the point of use
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32 // = m / 2
+#define MAYO_2_O_bytes 540 // = v * o / 2
+#define MAYO_2_v_bytes 30 // = ceil(v / 2)
+#define MAYO_2_r_bytes 36 // = ceil(k * o / 2)
+#define MAYO_2_P1_bytes 58560 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_2_P2_bytes 34560 // = v * o * m_bytes
+#define MAYO_2_P3_bytes 5472 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_2_csk_bytes 24 // same value as sk_seed_bytes
+#define MAYO_2_esk_bytes 93684 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_2_cpk_bytes 5488 // = pk_seed_bytes + P3_bytes
+#define MAYO_2_epk_bytes 98592 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_2_sig_bytes 180 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_2_f_tail F_TAIL_64 // brace initializer, used when MAYO_VARIANT is fixed at compile time
+#define MAYO_2_f_tail_arr f_tail_64 // array name, used by params.c in the dynamic build
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o) // 89
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) // 111; MAYO_3_k is defined below, which is fine because macros expand at the point of use
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48 // = m / 2
+#define MAYO_3_O_bytes 445 // = v * o / 2
+#define MAYO_3_v_bytes 45 // = ceil(v / 2)
+#define MAYO_3_r_bytes 55 // = ceil(k * o / 2)
+#define MAYO_3_P1_bytes 192240 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_3_P2_bytes 42720 // = v * o * m_bytes
+#define MAYO_3_P3_bytes 2640 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_3_csk_bytes 32 // same value as sk_seed_bytes
+#define MAYO_3_esk_bytes 235437 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_3_cpk_bytes 2656 // = pk_seed_bytes + P3_bytes
+#define MAYO_3_epk_bytes 237600 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_3_sig_bytes 577 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_3_f_tail F_TAIL_96 // brace initializer, used when MAYO_VARIANT is fixed at compile time
+#define MAYO_3_f_tail_arr f_tail_96 // array name, used by params.c in the dynamic build
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o) // 121
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) // 145; MAYO_5_k is defined below, which is fine because macros expand at the point of use
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64 // = m / 2
+#define MAYO_5_O_bytes 726 // = v * o / 2
+#define MAYO_5_v_bytes 61 // = ceil(v / 2)
+#define MAYO_5_r_bytes 72 // = ceil(k * o / 2)
+#define MAYO_5_P1_bytes 472384 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_5_P2_bytes 92928 // = v * o * m_bytes
+#define MAYO_5_P3_bytes 4992 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_5_csk_bytes 40 // same value as sk_seed_bytes
+#define MAYO_5_esk_bytes 566078 // = sk_seed_bytes + O_bytes + P1_bytes + P2_bytes
+#define MAYO_5_cpk_bytes 5008 // = pk_seed_bytes + P3_bytes
+#define MAYO_5_epk_bytes 570304 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_5_sig_bytes 838 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_5_f_tail F_TAIL_128 // brace initializer, used when MAYO_VARIANT is fixed at compile time
+#define MAYO_5_f_tail_arr f_tail_128 // array name, used by params.c in the dynamic build
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) // extra level so a and b are macro-expanded before ## pastes them
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) // e.g. MAYO_VARIANT=MAYO_2, end=n gives MAYO_2_n
+// With MAYO_VARIANT defined, MAYO_NAMESPACE(s) renames s to pqmayo_<variant>_<build>_<s>.
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+// The first MAYO_BUILD_TYPE_* macro found defined picks the <build> part; none defined is a hard error.
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+// Without MAYO_VARIANT the symbol names are left untouched.
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // parameter set chosen at run time: each bound is the maximum over MAYO_1/2/3/5
+#define NAME_MAX mayo5 // NOTE(review): same name as POSIX NAME_MAX from <limits.h> — confirm no translation unit sees both
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18 // largest o belongs to MAYO_2, not MAYO_5
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472 // from MAYO_2
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488 // from MAYO_2
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // parameter set fixed at compile time: each bound is that set's own value
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // read from the sk_seed parameter itself instead of borrowing the salt size (equal today for every set)
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // accessors read the mayo_params_t that p points to; p is parenthesised so any pointer expression is safe
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // accessors ignore p and expand to the compile-time constants of MAYO_VARIANT
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail); // internal linkage: every translation unit including this header gets its own copy
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * One MAYO parameter set. With ENABLE_PARAMS_DYNAMIC the PARAM_*(p) accessors read these fields; otherwise they expand to constants.
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail; // read through PARAM_f_tail(p)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): no MAYO_<n>_R_bytes macro or PARAM_R_bytes accessor exists in this header — confirm whether it is still needed
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // read through PARAM_name(p)
+} mayo_params_t;
+
+typedef struct sk_t { // type of the esk argument of mayo_expand_sk
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // room for P1_BYTES_MAX + P2_BYTES_MAX bytes, held as 64-bit words
+    uint8_t o[O_BYTES_MAX]; // room for O_BYTES_MAX bytes
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+// NOTE(review): undocumented; parameters parallel mayo_sign, presumably writing only the signature to sig and its length to *siglen — confirm in mayo.c.
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible to allocate sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ * 
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // byte-swap via compiler builtins where available
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // portable shift-and-mask fallback; both macros evaluate i more than once, and BSWAP64 needs a 64-bit operand for (i) >> 32
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative; the difference is taken in 64 bits so it cannot overflow for any pair of ints
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(int64_t)-1));
+}
+
+// Comparison mask for a > b, taken from the sign of b - a computed in 64 bits (no branch in the source).
+// Yields 0xFFFFFFFFFFFFFFFF when a > b and 0 otherwise.
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t b_minus_a = (int64_t) b - (int64_t) a;
+    return (uint64_t) (b_minus_a >> 63);
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF; the XOR is widened as an unsigned value so operands of opposite sign also compare as different
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t)((-(int64_t)(uint32_t)(a ^ b)) >> (8*sizeof(int64_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF; the XOR is widened as an unsigned value so operands of opposite sign also compare as different
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t)((-(int64_t)(uint32_t)(a ^ b)) >> (8*sizeof(uint64_t)-1));
+}
+
+// 0x00 when the two bytes match, 0xFF when they differ: negating a non-zero XOR sets the sign bit, which the shift then spreads
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (int8_t) (-(int32_t) (a ^ b) >> 31);
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding to OQS_MEM_secure_free.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding to OQS_MEM_cleanse; nothing is freed.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/params.c b/src/sig/mayo/pqmayo_mayo-2_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC // the parameter-set objects exist only in the run-time-selectable build
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+// Defines the object nm from the nm##_* macros; the invocation supplies the closing ';'. R_bytes is not listed and is therefore zero.
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// XORs P1*O into acc -> P1: v x v (only columns c >= r of row r are read), O: v x o, passed as the shuffle tables O_multabs
+static 
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // Every matrix entry spans two consecutive __m256i words, in P1 as well as in acc.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts __m256i words of P1 already consumed, so it advances by two per entry.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            cols_used ++;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // in_odd*/in_even* hold the low/high nibble of every byte; each nibble indexes the 16-byte table in its 128-bit lane
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); 
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // NOTE(review): acc is updated through plain __m256i lvalues, which presumably requires _acc to be 32-byte aligned — confirm at call sites
+        // convert to normal format and add to accumulator 
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k]     ^ _mm256_slli_epi16(t0,4);
+            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+// XORs O^t * P1O_P2 into acc -> P1O_P2: v x o (two __m256i words per entry), O passed as the shuffle tables O_multabs
+static 
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of the input and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // in_odd*/in_even* hold the low/high nibble of every byte; each nibble indexes the 16-byte table in its 128-bit lane
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+        // NOTE(review): acc is updated through plain __m256i lvalues, which presumably requires _acc to be 32-byte aligned — confirm at call sites
+        // convert to normal format and add to accumulator: entry (k, c) and entry (k+1, c) of the o x o result
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]  ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(k*O_MAX) + 2*c]      ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c]     ^= temp[2*k+1] ^ t0;
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+// XORs (P1 + P1^t)*O into acc: row r uses entries (c, r) for c < r and (r, c) for c > r of the upper-triangular P1; the diagonal is skipped
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // cols_used and pos count matrix entries (two __m256i words each), hence the 2* in every load below.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // The shuffle tables for O are built locally by mayo_O_multabs_avx2 rather than passed in.
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        cols_used += 1; // step over the diagonal entry (r, r)
+        size_t pos = r; // entry (0, r) in the packed upper-triangular layout
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // advance to entry (c+1, r): packed row c holds V_MAX - c entries
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // entries (r, c) to the right of the diagonal are contiguous in P1 and are walked with cols_used
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // convert to normal format and add to accumulator, using unaligned loads and stores on acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+            __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+            __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k),     acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+        }
+    }
+}
+
+// XORs V^t * L into acc -> L: v x o (two __m256i words per entry), V passed as the shuffle tables V_multabs; acc is indexed as k x o
+static 
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // NOTE(review): K_OVER_2 is defined outside this file's visible part — presumably ceil(K_MAX / 2); confirm.
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k;
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // in_odd*/in_even* hold the low/high nibble of every byte; each nibble indexes the 16-byte table in its 128-bit lane
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+        // NOTE(review): acc is updated through plain __m256i lvalues, which presumably requires _acc to be 32-byte aligned — confirm at call sites
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]     ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stops at k == K_MAX - 1, so that last row is handled here
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// XORs P1 * V^t into acc -> P1: v x v (only columns c >= r of row r are read), V passed as the shuffle tables V_multabs; acc is indexed as v x k
+static 
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts matrix entries of P1 already consumed (two __m256i words each).
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        // NOTE(review): K_OVER_2 is defined outside this file's visible part — presumably ceil(K_MAX / 2); confirm.
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // in_odd*/in_even* hold the low/high nibble of every byte; each nibble indexes the 16-byte table in its 128-bit lane
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+        // NOTE(review): acc is updated through plain __m256i lvalues, which presumably requires _acc to be 32-byte aligned — confirm at call sites
+        // convert to normal format and add to accumulator 
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stops at k == K_MAX - 1, so that last column is handled here
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+// XORs V^t * Pv into acc -> Pv: v x k (two __m256i words per entry), V passed as the shuffle tables V_multabs; acc is indexed as k x k
+static 
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // NOTE(review): K_OVER_2 is defined outside this file's visible part — presumably ceil(K_MAX / 2); confirm.
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k;
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // in_odd*/in_even* hold the low/high nibble of every byte; each nibble indexes the 16-byte table in its 128-bit lane
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+        // NOTE(review): acc is updated through plain __m256i lvalues, which presumably requires _acc to be 32-byte aligned — confirm at call sites
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop stops at k == K_MAX - 1, so that last row is handled here
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1; // read sequentially, two vectors per entry, rows stored back to back starting at the diagonal
+    const __m256i *P2 = (__m256i *) _P2; // entry (r, c) spans two vectors starting at P2[2*(r*O_MAX + c)]
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // For each row r < V_MAX: sums S1_multabs lookups over the stored P1 entries (r, c), c >= r, plus S2_multabs lookups over P2 entries (r, c), c < O_MAX, then XORs the unpacked row into acc entries (r, k), k < K_MAX.
+    size_t P1_cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        // temp[4*j .. 4*j+3] collect table j's lookups for the (low, high) nibbles of the first half and of the second half; both products below share it
+        // P1 * S1
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used ++;
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to column c
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 * S2
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, column k receives the low nibbles of the (temp[2k], temp[2k+1]) pair and column k+1 their high nibbles; t0 handles the first half, t1 the second
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+        // odd K_MAX: k == K_MAX - 1 here, and only the low-nibble result (column k) is written
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static 
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P3 = (__m256i *) _P3; // read sequentially, two vectors per entry, rows stored back to back starting at the diagonal
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // For each row r < O_MAX: sums S2_multabs lookups over the stored P3 entries (r, c), c >= r, then XORs the unpacked row into acc entries (r, k), k < K_MAX.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to column c
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, column k receives the low nibbles of the (temp[2k], temp[2k+1]) pair and column k+1 their high nibbles; t0 handles the first half, t1 the second
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+        // odd K_MAX: k == K_MAX - 1 here, and only the low-nibble result (column k) is written
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+static inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    // Pure forwarder: the arguments go unchanged to mayo_5_Vt_times_Pv_avx2, with S1_multabs in the V_multabs slot.
+    mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2; // entry (r, c) spans two vectors: PS2[2*(r*K_MAX + c)] and the one after it
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope because the odd-K_MAX tail reuses the value the unpack loop stops at
+    // For every column c < K_MAX: XOR-sums, over r < O_MAX, the S2_multabs table lookups selected by the nibbles of PS2 entry (r, c), then XORs the unpacked sums into acc entries (k, c). Same loop structure as mayo_5_Vt_times_Pv_avx2 with the row bound O_MAX instead of V_MAX.
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate output column c in packed form: temp[4*j .. 4*j+3] collect table j's lookups for the (low, high) nibbles of the first half and of the second half
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to row r
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, row k receives the low nibbles of the (temp[2k], temp[2k+1]) pair and row k+1 their high nibbles; t0 handles the first half, t1 the second
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+        // odd K_MAX: k == K_MAX - 1 here, and only the low-nibble result (row k) is written, so no row K_MAX is touched
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v, O: v x o
+static 
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // For each row r < V_MAX: sums O_multabs lookups over the stored P1 entries (r, c), c >= r, then XORs the unpacked row into acc entries (r, k), k < O_MAX. Every P1/acc entry is one 256-bit vector.
+    const __m256i *P1 = (__m256i *) _P1; // read sequentially; rows are stored back to back starting at the diagonal
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate one output row in packed form: temp[k], temp[k+1] collect table k/2's lookups for the low and high input nibbles
+        __m256i temp[O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+            // the k += 2 stride touches temp[k + 1], so this loop relies on O_MAX being even
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, column k receives the low nibbles of (temp[k], temp[k+1]) and column k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    // For every column c < O_MAX: XOR-sums, over r < V_MAX, the O_multabs lookups selected by the nibbles of input entry (r, c), then XORs the unpacked sums into acc entries (k, c), k < O_MAX. Presumably O^T * (P1*O + P2) as the name suggests -- confirm.
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2; // row-major V_MAX x O_MAX, one vector per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f); // 0x0f in every byte
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // accumulate output column c in packed form: temp[k], temp[k+1] collect table k/2's lookups for the low and high input nibbles
+        __m256i temp[O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            // the k += 2 stride touches temp[k + 1], so this loop relies on O_MAX being even
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, row k receives the low nibbles of (temp[k], temp[k+1]) and row k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c]     ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // For each row r < V_MAX: sums O_multabs lookups over every column c != r, taking entry (c, r) of the packed upper-triangular P1 when c < r and entry (r, c) when c > r, then XORs the unpacked row into acc. Presumably (P1 + P1^T) * O -- confirm.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // only accessed through unaligned load/store intrinsics in this function
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // lookup tables are built locally from O by mayo_O_multabs_avx2 (defined outside this view); note the array lives on the stack
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[O_MAX] = {0};
+        cols_used += 1; // skip the diagonal entry (r, r)
+        size_t pos = r; // index of entry (0, r) in the packed storage
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+            pos += (V_MAX -c - 1); // row c holds V_MAX - c entries, so this lands on entry (c + 1, r)
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            // part below the diagonal, read from the transposed position
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+        // part above the diagonal, read sequentially from row r
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+        // unpack and add to accumulator: per byte, column k receives the low nibbles of (temp[k], temp[k+1]) and column k+1 their high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256(acc + (r*O_MAX + k    ), acc0 ^ temp[k  ] ^ _mm256_slli_epi16(t,4));
+            _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // For every column c < O_MAX: XOR-sums, over r < V_MAX, the V_multabs lookups selected by the nibbles of L entry (r, c), then XORs the unpacked sums into acc entries (k, c), k < K_MAX. Presumably V^T * L -- confirm.
+    const __m256i *L = (__m256i *) _L; // row-major V_MAX x O_MAX, one vector per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope because the odd-K_MAX tail reuses the value the unpack loop stops at
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // accumulate output column c in packed form: temp[2*j], temp[2*j+1] collect table j's lookups for the low and high input nibbles
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to row r
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, row k receives the low nibbles of (temp[k], temp[k+1]) and row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // k == K_MAX - 1 here; only the low-nibble result (row k) is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // For every column c < K_MAX: XOR-sums, over r < V_MAX, the V_multabs lookups selected by the nibbles of Pv entry (r, c), then XORs the unpacked sums into acc entries (k, c), k < K_MAX. Presumably V^T * Pv -- confirm.
+    const __m256i *Pv = (__m256i *) _Pv; // row-major V_MAX x K_MAX, one vector per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope because the odd-K_MAX tail reuses the value the unpack loop stops at
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate output column c in packed form: temp[2*j], temp[2*j+1] collect table j's lookups for the low and high input nibbles
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to row r
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, row k receives the low nibbles of (temp[k], temp[k+1]) and row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // k == K_MAX - 1 here; only the low-nibble result (row k) is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+static inline void
+mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    // Row kernel over the packed upper-triangular P1: for each row r < V_MAX it combines the stored
+    // entries (r, c), c >= r, with the V_multabs tables of column c and XORs the unpacked result into
+    // acc entries (r, k), k < K_MAX. Each P1/acc entry is one 256-bit vector of packed nibbles.
+    const __m256i nibble = _mm256_set1_epi8(0x0f);
+    const __m256i *entry = (__m256i *) _P1;    // next unread P1 entry; rows are stored back to back
+    __m256i *out_row = (__m256i *) _acc;       // start of acc row r; accessed by plain dereference
+
+    for (size_t r = 0; r < V_MAX; r++, out_row += K_MAX)
+    {
+        __m256i packed[K_OVER_2*2] = {0};
+
+        for (size_t c = r; c < V_MAX; c++, entry++)
+        {
+            const __m256i raw = _mm256_loadu_si256(entry);
+            const __m256i lo = raw & nibble;                          // low nibble of every byte
+            const __m256i hi = _mm256_srli_epi16(raw, 4) & nibble;    // high nibble of every byte
+            const __m256i *tables = V_multabs + K_OVER_2*c;
+            for (size_t j = 0; j < K_OVER_2; j++)
+            {
+                packed[2*j]     ^= _mm256_shuffle_epi8(tables[j], lo);
+                packed[2*j + 1] ^= _mm256_shuffle_epi8(tables[j], hi);
+            }
+        }
+
+        // unpack: column k takes the low nibbles of (packed[k], packed[k+1]), column k+1 the high ones
+        size_t k = 0;
+        for (; k + 1 < K_MAX; k += 2)
+        {
+            const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+            out_row[k]     ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+            out_row[k + 1] ^= packed[k + 1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+        out_row[k] ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+#endif
+    }
+}
+
+// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
+// same as mayo_12_P1_times_Vt_avx2
+static inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+    // Pure forwarder: the arguments go unchanged to mayo_12_P1_times_Vt_avx2, with S1_multabs in the V_multabs slot.
+    mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+static inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    // Pure forwarder: the arguments go unchanged to mayo_12_Vt_times_Pv_avx2, with S1_multabs in the V_multabs slot.
+    mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2; // row-major O_MAX x K_MAX, one vector per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope because the odd-K_MAX tail reuses the value the unpack loop stops at
+    // For every column c < K_MAX: XOR-sums, over r < O_MAX, the S2_multabs lookups selected by the nibbles of PS2 entry (r, c), then XORs the unpacked sums into acc entries (k, c), k < K_MAX.
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate output column c in packed form: temp[2*j], temp[2*j+1] collect table j's lookups for the low and high input nibbles
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to row r
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, row k receives the low nibbles of (temp[k], temp[k+1]) and row k+1 their high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // k == K_MAX - 1 here; only the low-nibble result (row k) is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+static inline void
+mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+    // Row kernel over the dense row-major V_MAX x O_MAX matrix P2: for each row r it combines the
+    // entries (r, c), c < O_MAX, with the S2_multabs tables of column c and XORs the unpacked result
+    // into acc entries (r, k), k < K_MAX. Each P2/acc entry is one 256-bit vector of packed nibbles.
+    const __m256i nibble = _mm256_set1_epi8(0x0f);
+    const __m256i *in_row = (__m256i *) _P2;   // start of P2 row r
+    __m256i *out_row = (__m256i *) _acc;       // start of acc row r; accessed by plain dereference
+
+    for (size_t r = 0; r < V_MAX; r++, in_row += O_MAX, out_row += K_MAX)
+    {
+        __m256i packed[K_OVER_2*2] = {0};
+
+        for (size_t c = 0; c < O_MAX; c++)
+        {
+            const __m256i raw = _mm256_loadu_si256(in_row + c);
+            const __m256i lo = raw & nibble;                          // low nibble of every byte
+            const __m256i hi = _mm256_srli_epi16(raw, 4) & nibble;    // high nibble of every byte
+            const __m256i *tables = S2_multabs + K_OVER_2*c;
+            for (size_t j = 0; j < K_OVER_2; j++)
+            {
+                packed[2*j]     ^= _mm256_shuffle_epi8(tables[j], lo);
+                packed[2*j + 1] ^= _mm256_shuffle_epi8(tables[j], hi);
+            }
+        }
+
+        // unpack: column k takes the low nibbles of (packed[k], packed[k+1]), column k+1 the high ones
+        size_t k = 0;
+        for (; k + 1 < K_MAX; k += 2)
+        {
+            const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+            out_row[k]     ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+            out_row[k + 1] ^= packed[k + 1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+        out_row[k] ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1; // read sequentially, one vector per entry, rows stored back to back starting at the diagonal
+    const __m256i *P2 = (__m256i *) _P2; // row-major V_MAX x O_MAX, one vector per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, which presumes _acc is 32-byte aligned -- confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // For each row r < V_MAX: sums S1_multabs lookups over the stored P1 entries (r, c), c >= r, plus S2_multabs lookups over P2 entries (r, c), c < O_MAX, then XORs the unpacked row into acc entries (r, k), k < K_MAX.
+    size_t P1_cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+
+        // temp[2*j], temp[2*j+1] collect table j's lookups for the low and high input nibbles; both products below share it
+        // P1 * S1
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte, moved down
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            P1_cols_used ++;
+            // the nibbles act as byte-shuffle indices into the K_OVER_2 tables that belong to column c
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // P2 * S2
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // unpack and add to accumulator: per byte, column k receives the low nibbles of (temp[k], temp[k+1]) and column k+1 their high nibbles
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1 // k == K_MAX - 1 here; only the low-nibble result (column k) is written
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+static inline void
+mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    // Row kernel over the packed upper-triangular P3: for each row r < O_MAX it combines the stored
+    // entries (r, c), c >= r, with the S2_multabs tables of column c and XORs the unpacked result into
+    // acc entries (r, k), k < K_MAX. Each P3/acc entry is one 256-bit vector of packed nibbles.
+    const __m256i nibble = _mm256_set1_epi8(0x0f);
+    const __m256i *entry = (__m256i *) _P3;    // next unread P3 entry; rows are stored back to back
+    __m256i *out_row = (__m256i *) _acc;       // start of acc row r; accessed by plain dereference
+
+    for (size_t r = 0; r < O_MAX; r++, out_row += K_MAX)
+    {
+        __m256i packed[K_OVER_2*2] = {0};
+
+        for (size_t c = r; c < O_MAX; c++, entry++)
+        {
+            const __m256i raw = _mm256_loadu_si256(entry);
+            const __m256i lo = raw & nibble;                          // low nibble of every byte
+            const __m256i hi = _mm256_srli_epi16(raw, 4) & nibble;    // high nibble of every byte
+            const __m256i *tables = S2_multabs + K_OVER_2*c;
+            for (size_t j = 0; j < K_OVER_2; j++)
+            {
+                packed[2*j]     ^= _mm256_shuffle_epi8(tables[j], lo);
+                packed[2*j + 1] ^= _mm256_shuffle_epi8(tables[j], hi);
+            }
+        }
+
+        // unpack: column k takes the low nibbles of (packed[k], packed[k+1]), column k+1 the high ones
+        size_t k = 0;
+        for (; k + 1 < K_MAX; k += 2)
+        {
+            const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+            out_row[k]     ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+            out_row[k + 1] ^= packed[k + 1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        const __m256i t = (packed[k + 1] ^ _mm256_srli_epi16(packed[k], 4)) & nibble;
+        out_row[k] ^= packed[k] ^ _mm256_slli_epi16(t, 4);
+#endif
+    }
+}
+
+
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+    // Packs the leading O_MAX x O_MAX block of the row-major matrix `in` (row stride `size` entries) into upper-triangular order in `out`: out(r,r) = in(r,r) and out(r,c) = in(r,c) ^ in(c,r) for c > r. Entries are m_legs*2 limbs apart, but exactly one 256-bit vector is moved per entry (presumably m_legs == 2 -- confirm).
+    int m_vecs_stored = 0; // number of entries already written to `out`
+
+    for (int r = 0; r < O_MAX; ++r) {
+        const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+        __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+        _mm256_storeu_si256(_out, _mm256_loadu_si256(_in)); // diagonal entry is copied; unaligned ops because in/out are plain uint64_t buffers with no visible 32-byte alignment guarantee
+        m_vecs_stored++;
+        for (int c = r + 1; c < O_MAX; ++c) {
+            const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+            const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+            _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+            _mm256_storeu_si256(_out, _mm256_loadu_si256(_in2) ^ _mm256_loadu_si256(_in3)); // fold the mirrored entry into the upper one
+            m_vecs_stored++;
+        }
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v (upper triangular, packed row by row), O: v x o; acc (v x o) ^= P1*O
+static 
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+    // Each matrix entry is a vector of 96 nibbles (6 uint64_t words); products are XOR-accumulated into acc.
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // index of the next packed P1 entry to consume
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // multiply row r of P1 by O and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); // one table serves output columns k and k+1
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (r,k) and (r,k+1) of acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+    // acc (O_MAX x O_MAX entries) ^= O^T * P1O_P2, with P1O_P2 read row-major as V_MAX x O_MAX entries of 6 words each.
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // multiply column c of P1O_P2 by O^T and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0); // one table serves output rows k and k+1
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (k,c) and (k+1,c) of acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc (V_MAX x O_MAX entries) ^= (P1 + P1^T) * O, with P1 packed upper triangular; diagonal entries are never read.
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0; // index of the next packed P1 entry in row-by-row order
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        cols_used ++; // step over the diagonal entry (r,r)
+        // entries below the diagonal come from the transpose: walk column r of the packed upper triangle
+        __m256i temp[2*O_MAX] = {0};
+        size_t pos = r; // packed index of entry (0,r)
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // move to entry (c+1,r) in the next, one-shorter packed row
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // entries right of the diagonal: P1[r][c] with c > r, read sequentially from the packed row
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (r,k) and (r,k+1) of acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+    // acc (K_MAX x O_MAX entries) ^= V^T * L: acc[k][c] ^= sum over r < V_MAX of (scalar from V_multabs[K_OVER_2*r + k/2]) * L[r][c].
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t k;
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // multiply column c of L by the V tables and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); // table k serves output rows 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (k,c) and (k+1,c) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+        // odd K_MAX: the last output row k = K_MAX - 1 has no partner, so only entry (k,c) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+      const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc (V_MAX x K_MAX entries) ^= P1 * V^T, with P1 packed upper triangular row by row; entries are 6 words each.
+    size_t k,c;
+    size_t cols_used = 0; // index of the next packed P1 entry to consume
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // multiply row r of P1 by the V tables and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0); // table k serves output columns 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (r,k) and (r,k+1) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+        // odd K_MAX: the last output column k = K_MAX - 1 has no partner, so only entry (r,k) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc (K_MAX x K_MAX entries) ^= V^T * Pv, with Pv read row-major as V_MAX x K_MAX entries of 6 words each.
+    size_t k;
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // multiply column c of Pv by the V tables and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0); // table k serves output rows 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (k,c) and (k+1,c) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+        // odd K_MAX: the last output row k = K_MAX - 1 has no partner, so only entry (k,c) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, packed), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc (V_MAX x K_MAX entries of 6 words) ^= P1*S1 + P2*S2; both products share one temporary per output row.
+    size_t k,c;
+    size_t P1_cols_used = 0; // index of the next packed P1 entry to consume
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // multiply row r by the S tables and accumulate the products in a temporary, nibble-interleaved format
+        // P1 times S1
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0); // table k serves output columns 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 times S2 (P2 is read row-major, O_MAX entries per row)
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (r,k) and (r,k+1) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+        // odd K_MAX: the last output column k = K_MAX - 1 has no partner, so only entry (r,k) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o (upper triangular, packed row by row), S2: o x k; acc (o x k) ^= P3*S2
+static 
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Each matrix entry is a vector of 96 nibbles (6 uint64_t words); products are XOR-accumulated into acc.
+    size_t k,c;
+    size_t cols_used = 0; // index of the next packed P3 entry to consume
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // multiply row r of P3 by the S2 tables and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0); // table k serves output columns 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (r,k) and (r,k+1) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+        // odd K_MAX: the last output column k = K_MAX - 1 has no partner, so only entry (r,k) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+    mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); // thin alias: same computation, with the S1 tables passed as V_multabs and PS1 as Pv
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc (K_MAX x K_MAX entries) ^= S2^T * PS2, with PS2 read row-major as O_MAX x K_MAX entries of 6 words each.
+    size_t k;
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // multiply column c of PS2 by the S2 tables and accumulate the products in a temporary, nibble-interleaved format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // field elements #0..#63 (bytes 0..31 of the entry)
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // field elements #32..#95 (bytes 16..47, overlapping the first load)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0); // table k serves output rows 2k and 2k+1
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // untangle the interleaved nibbles and XOR the result into entries (k,c) and (k+1,c) of acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+        // odd K_MAX: the last output row k = K_MAX - 1 has no partner, so only entry (k,c) is written back
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) product of a and b, with the field built as GF(2)[x]/(x^4 + x + 1).
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // Branch-free schoolbook product: bit i of a's low nibble contributes b << i.
+    unsigned char prod = (unsigned char) ((a & 1) * b);
+    prod ^= (unsigned char) ((a & 2) * b);
+    prod ^= (unsigned char) ((a & 4) * b);
+    prod ^= (unsigned char) ((a & 8) * b);
+
+    // Fold the bits above the low nibble back in: x^4 = x + 1, so each of them
+    // lands four and three positions lower. Only the low nibble is returned.
+    unsigned char high = prod & 0xf0;
+    unsigned char folded = (unsigned char) (prod ^ (high >> 4) ^ (high >> 3));
+    return folded & 0x0f;
+}
+
+// Multiply eight GF(16) elements at once by a; assumes each byte of b holds one element (value < 16).
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // per-byte carry-less product, selected bit by bit from a's low nibble
+    uint64_t prod = (a & 1) * b;
+    prod ^= (a & 2) * b;
+    prod ^= (a & 4) * b;
+    prod ^= (a & 8) * b;
+
+    // reduce every byte mod x^4 + x + 1
+    uint64_t high = prod & 0xf0f0f0f0f0f0f0f0;
+    prod ^= (high >> 4) ^ (high >> 3);
+    return prod & 0x0f0f0f0f0f0f0f0f;
+}
+
+// GF(16) addition: in characteristic 2 this is the bitwise XOR of the two operands.
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: the same XOR as addition, since every element is its own negative.
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: returns a unchanged, because -a == a in characteristic 2.
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+// Inverse in GF(16) as a^14 (a^15 == 1 for nonzero a); an input of 0 yields 0.
+static inline unsigned char inverse_f(unsigned char a) {
+    // For a in 0..15 this matches the lookup table
+    // {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, 10, 4, 3, 8}, but without indexing by a.
+
+    const unsigned char sq   = mul_f(a, a);       // a^2
+    const unsigned char quad = mul_f(sq, sq);     // a^4
+    const unsigned char oct  = mul_f(quad, quad); // a^8
+    const unsigned char six  = mul_f(sq, quad);   // a^6
+
+    return mul_f(oct, six);                       // a^8 * a^6 = a^14
+}
+
+// GF(16) dot product of a[0..n-1] with b[0], b[m], ..., b[(n-1)*m] (b is read with stride m).
+static inline unsigned char lincomb(const unsigned char *a, const unsigned char *b, int n, int m) {
+    unsigned char sum = 0;
+    for (int idx = 0; idx < n; ++idx) {
+        sum = add_f(mul_f(a[idx], b[idx * m]), sum);
+    }
+    return sum;
+}
+
+// c = a * b over GF(16): a is row_a x colrow_ab, b is colrow_ab x col_b, c is row_a x col_b (row-major).
+static inline void mat_mul(const unsigned char *a, const unsigned char *b, unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; ++r) {
+        for (int col = 0; col < col_b; ++col) {
+            c[r * col_b + col] = lincomb(a + r * colrow_ab, b + col, colrow_ab, col_b);
+        }
+    }
+}
+
+// Element-wise GF(16) sum c = a + b of two row-major m x n matrices.
+static inline void mat_add(const unsigned char *a, const unsigned char *b, unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; ++row) {
+        for (int col = 0; col < n; ++col) {
+            c[row * n + col] = add_f(a[row * n + col], b[row * n + col]);
+        }
+    }
+}
+
+// Copy 2 * m_legs 64-bit words from in to out.
+static inline void m_vec_copy(int m_legs, const uint64_t *in, uint64_t *out) {
+    const int words = 2 * m_legs;
+    for (int w = 0; w < words; ++w) {
+        out[w] = in[w];
+    }
+}
+
+// XOR 2 * m_legs 64-bit words of in into acc (vector addition in characteristic 2).
+static inline void m_vec_add(int m_legs, const uint64_t *in, uint64_t *acc) {
+    const int words = 2 * m_legs;
+    for (int w = 0; w < words; ++w) {
+        acc[w] ^= in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+// Multiply the eight GF(16) elements nibble-packed in a by the field element held in
+// the low four bits of b.
+// The running value cur holds a * x^i for bit i of b and is XORed into the result when
+// that bit is set; the selection multiplies by 0 or 1 instead of branching.
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    uint32_t cur = a;
+    uint32_t scalar = b;
+    uint32_t result = cur * (scalar & 1);
+
+    for (int bit = 1; bit < 4; ++bit) {
+        // cur <- cur * x in every nibble:
+        //  - strip each nibble's top coefficient (mask 0x88888888) so the shift cannot
+        //    spill into the neighbouring nibble,
+        //  - shift left by one,
+        //  - re-insert each stripped coefficient as x + 1 (binary 0011, hence "* 3").
+        uint32_t top = cur & 0x88888888;
+        cur ^= top;
+        cur = (cur << 1) ^ ((top >> 3) * 3);
+
+        result ^= cur * ((scalar >> bit) & 1);
+    }
+
+    return result;
+}
+ 
+// XOR gf16v_mul_u64(in[w], a) into acc[w] for each of the 2 * m_legs 64-bit words.
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w)
+        acc[w] ^= gf16v_mul_u64(in[w], a);
+}
+
+// Same as m_vec_mul_add with the fixed scalar 0x2 (the field element x).
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w)
+        acc[w] ^= gf16v_mul_u64(in[w], 0x2);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/LICENSE b/src/sig/mayo/pqmayo_mayo-2_opt/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/NOTICE b/src/sig/mayo/pqmayo_mayo-2_opt/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-2_opt/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+// Non-AES-NI fallback: aes128ctr_prf(output, outputByteLen, input, <12 zero bytes>); returns outputByteLen as int.
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 };
+    (void) inputByteLen; // the length of input is not consulted
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);
+    return (int) outputByteLen;
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.c b/src/sig/mayo/pqmayo_mayo-2_opt/api.c
new file mode 100644
index 0000000000..a7cf85eedf
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_2
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Forwards to mayo_keypair() with MAYO_PARAMS (&MAYO_2 for dynamic parameters, otherwise 0).
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Forwards to mayo_sign() with MAYO_PARAMS; sm/smlen are the outputs, m/mlen/sk the inputs,
+// and the return value is passed through unchanged.
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m, size_t mlen,
+                const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+// Forwards to mayo_sign_signature() with MAYO_PARAMS; sig/siglen are the outputs,
+// m/mlen/sk the inputs, and the return value is passed through unchanged.
+int crypto_sign_signature(unsigned char *sig, size_t *siglen, const unsigned char *m, size_t mlen,
+                          const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Forwards to mayo_open() with MAYO_PARAMS; m/mlen are the outputs, sm/smlen/pk the inputs,
+// and the return value is passed through unchanged.
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+// Returns -1 unless siglen == CRYPTO_BYTES; otherwise the result of mayo_verify(m, mlen, sig, pk).
+int crypto_sign_verify(const unsigned char *sig, size_t siglen, const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    if (siglen == CRYPTO_BYTES) {
+        return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+    }
+    return -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/api.h b/src/sig/mayo/pqmayo_mayo-2_opt/api.h
new file mode 100644
index 0000000000..265a5639db
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 24
+#define CRYPTO_PUBLICKEYBYTES 5488
+#define CRYPTO_BYTES 180
+
+#define CRYPTO_ALGNAME "MAYO-2"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+// Folds a size x size matrix of m-vectors into its upper triangle, stored row by row:
+// out[r, c] = in[r, c] + in[c, r] for c > r, and out[r, r] = in[r, r].
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    const int words = m_legs * 2;   // 64-bit words per m-vector
+    uint64_t *dst = out;
+    for (int row = 0; row < size; row++) {
+        for (int col = row; col < size; col++, dst += words) {
+            m_vec_copy(m_legs, in + words * (row * size + col), dst);
+            if (col > row) m_vec_add(m_legs, in + words * (col * size + row), dst);
+        }
+    }
+#endif
+}
+
+// Accumulates (P1 + P1^T) * O into acc. P1 is read as the row-by-row upper triangle
+// (diagonal included) of a v x v matrix of m-vectors; O is v x o, row-major, one byte per entry.
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+    int bs_mat_entries_used = 0; // position of entry (r, c) in the packed upper triangle
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) {
+                // the diagonal of P1 + P1^T is zero in characteristic 2: skip, but keep counting
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                // entry (r, c) times O[c][k] goes to acc[r][k]; times O[r][k] goes to acc[c][k]
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+// Sign helper: forms V*L into M and V*P1*V^T into Y (via the intermediate Pv = P1*V^T),
+// either with the AVX2 kernels of the compiled-in variant or with the generic routines.
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    const int m_legs  = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - param_o;
+    const int param_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+
+    // V (k x v) times L (v x o) -> M
+    mul_add_mat_x_m_mat(m_legs, V, L, M, param_k, param_v, param_o);
+
+    // upper-triangular P1 (v x v) times V^T -> Pv
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, V, Pv, param_v, param_v, param_k, 1);
+
+    // V (k x v) times Pv (v x k) -> Y
+    mul_add_mat_x_m_mat(m_legs, V, Pv, Y, param_k, param_v, param_k);
+#endif
+}
+
+// KeyGen helper: adds P1*O into P1O_P2 (which presumably already holds P2 - confirm at the
+// call site), then forms O^T * P1O_P2 into P3.
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    const int m_legs  = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int p1_dim  = PARAM_n(p) - param_o;   // P1 is p1_dim x p1_dim, upper triangular
+    // P1 * O accumulated into P1O_P2
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, p1_dim, p1_dim, param_o, 1);
+    // O^T * P1O_P2 accumulated into P3
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2] =: [PS1]
+//                   [  0  P3 ]   [S2]   [        P3*S2]    [PS2]
+// compute S * PS  = [ S1 S2 ] * [PS1] = S1*PS1 + S2*PS2
+//                               [PS2]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+    // Split every row of S (k rows of n entries) into its first v entries (S1) and last o entries (S2).
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    // handled in two halves, one call per block of S
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+#error "m_calculate_PS_SPS: no AVX2 implementation for this M_MAX (expected 64, 96 or 128)"
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar over the k*o coefficient columns only, so that r[k*o] is never read
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+        Ar[i] = lincomb(A + i * (k * o + 1), r, k * o, 1);
+    }
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o - 1);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+        // The scan is capped at the last coefficient column k*o - 1 so that x[k*o] is never touched.
+        for (int col = row; col <= col_upper_bound; col++) {
+            // NOTE(review): assumes EF leaves A in row echelon form, so every row has a pivot before column k*o.
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__) // byte order as reported by the compiler's predefined macros
+#define TARGET_BIG_ENDIAN
+#endif
+// Header selection: M_MAX == 64 / 96 / 128 picks the matching *_64 / *_96 / *_128 arithmetic header.
+#if defined(MAYO_AVX) && (M_MAX == 64) // the shuffle_* headers are only included when MAYO_AVX is defined
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64)) // without MAYO_VARIANT all three conditions hold, so every width is included
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h> // NOTE(review): the 64 branch also includes the 96 header - confirm this is intended
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h> // NOTE(review): the 96 branch also includes the 128 header - confirm this is intended
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    // Duplicate one 128-element vector from `in` into `out`.
+    // The vector occupies exactly eight 64-bit words (out[0..7] = in[0..7]).
+    // `in` is only read; `out` is overwritten word by word, lowest index
+    // first, which is the same order as a fully unrolled copy.
+    const int word_count = 8;
+    for (int w = 0; w < word_count; w++) {
+        out[w] = in[w];
+    }
+}
+
+
+static
+inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    // Add one 128-element vector into an accumulator.
+    // Addition is a bitwise XOR of the eight 64-bit words of `in` into the
+    // eight words of `acc` (acc[0..7] ^= in[0..7]), lowest index first.
+    // `in` is only read; `acc` is updated in place.
+    const int word_count = 8;
+    for (int w = 0; w < word_count; w++) {
+        acc[w] ^= in[w];
+    }
+}
+
+static // acc ^= x * in for every packed 4-bit element of the 8 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_msb = 0x8888888888888888ULL; // top bit (x^3 coefficient) of every 4-bit element
+    for(int i=0; i < 8;i++){ // 8 words of 16 elements each
+        uint64_t t = in[i] & mask_msb; // coefficients that would overflow to x^4 when shifted up
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); // shift the remaining bits up; x^4 reduces to x + 1 (0x3) modulo x^4+x+1
+    }
+}
+
+static // acc ^= x^-1 * in for every packed 4-bit element of the 8 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_lsb = 0x1111111111111111ULL; // lowest bit (constant coefficient) of every 4-bit element
+    for(int i=0; i < 8;i++){ // 8 words of 16 elements each
+        uint64_t t = in[i] & mask_lsb; // constant terms, which cannot simply be shifted down
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); // shift the rest down; 1/x equals x^3 + 1 (0x9) modulo x^4+x+1
+    }
+}
+
+static
+inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    // acc ^= a * in over the 8 words of a 128-element vector; mul_table(a) supplies one multiplier per input bit position.
+    const uint32_t products = mul_table(a);
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint64_t m0 = products & 0xff, m1 = (products >> 8) & 0xf;
+    const uint64_t m2 = (products >> 16) & 0xf, m3 = (products >> 24) & 0xf;
+    for (int w = 0; w < 8; w++) {
+        const uint64_t v = in[w];
+        acc[w] ^= (v & nibble_lsb) * m0 ^ ((v >> 1) & nibble_lsb) * m1
+                ^ ((v >> 2) & nibble_lsb) * m2 ^ ((v >> 3) & nibble_lsb) * m3;
+    }
+}
+
+static // Folds 16 bins of 8 words each (bin b at bins + b*8) into one vector and copies it to `out`; `bins` is modified in place.
+inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains end in bin 1: one multiplies by x^-1 (5,10,7,14,15,13,9,1), one by x (11,12,6,3,8,4,2,1). Bin 0 is never read.
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins +  10 * 8); // bin10 ^= x^-1 * bin5
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8); // bin12 ^= x * bin11
+    m_vec_mul_add_x_inv_128(bins +  10 * 8, bins +  7 * 8); // bin7 ^= x^-1 * bin10
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8); // bin6 ^= x * bin12
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins +  14 * 8); // bin14 ^= x^-1 * bin7
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8); // bin3 ^= x * bin6
+    m_vec_mul_add_x_inv_128(bins +  14 * 8, bins +  15 * 8); // bin15 ^= x^-1 * bin14
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8); // bin8 ^= x * bin3
+    m_vec_mul_add_x_inv_128(bins +  15 * 8, bins +  13 * 8); // bin13 ^= x^-1 * bin15
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8); // bin4 ^= x * bin8
+    m_vec_mul_add_x_inv_128(bins +  13 * 8, bins +  9 * 8); // bin9 ^= x^-1 * bin13
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8); // bin2 ^= x * bin4
+    m_vec_mul_add_x_inv_128(bins +   9 * 8, bins +  1 * 8); // bin1 ^= x^-1 * bin9
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8); // bin1 ^= x * bin2
+    vec_copy_128(bins + 8, out); // bins + 8 is bin 1, which now holds the folded result
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    // Copy the four 64-bit words of a 64-element vector, lowest index first.
+    for (int w = 0; w < 4; w++) {
+        out[w] = in[w];
+    }
+}
+
+static
+inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    // XOR the four 64-bit words of `in` into `acc`, lowest index first.
+    for (int w = 0; w < 4; w++) {
+        acc[w] ^= in[w];
+    }
+}
+
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) { // multiplies each of the 16 packed 4-bit elements of `a` by `b`; only bits 0..3 of `b` are used
+    uint64_t mask_msb = 0x8888888888888888ULL; // top bit of every 4-bit element
+    uint64_t a_msb;
+    uint64_t a64 = a;
+    uint64_t b32 = b; // widened copy of b (64-bit despite the name)
+    uint64_t r64 = a64 * (b32 & 1); // term for bit 0 of b: a * 1
+    // Each of the next three groups replaces a64 by x * a64 (x^4 reduces to x + 1, i.e. 0x3) and adds it when the matching bit of b is set.
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+    r64 ^= (a64) * ((b32 >> 1) & 1); // term for bit 1 of b: a * x
+    // Same step again; a64 becomes a * x^2.
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+    r64 ^= (a64) * ((b32 >> 2) & 1); // term for bit 2 of b: a * x^2
+    // Same step again; a64 becomes a * x^3.
+    a_msb = a64 & mask_msb; // MSB, 3rd bits
+    a64 ^= a_msb;   // clear MSB
+    a64 = (a64 << 1) ^ ((a_msb >> 3) * 3);
+    r64 ^= (a64) * ((b32 >> 3) & 1); // term for bit 3 of b: a * x^3
+    // The bit tests are done by multiplying with 0 or 1, so there is no branch on b.
+    return r64;
+}
+
+static inline uint32_t mul_table(uint8_t b){ // packs b, b*x, b*x^2 and b*x^3 into bytes 0..3; assumes b < 16 - TODO confirm with callers
+    uint32_t x = ((uint32_t) b) * 0x08040201; // byte i holds b << i, not yet reduced
+    // Bits 4..7 of each byte are the coefficients of x^4..x^7 and still have to be folded back.
+    uint32_t high_nibble_mask = 0xf0f0f0f0;
+    // x^4 = x + 1, so the overflow is added back once shifted by 4 (times 1) and once shifted by 3 (times x).
+    uint32_t high_half = x & high_nibble_mask;
+    return (x ^ (high_half >> 4) ^ (high_half >> 3)); // high nibbles are not cleared here; callers mask each byte
+}
+
+static
+inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    // acc ^= a * in over the 4 words of a 64-element vector; mul_table(a) supplies one multiplier per input bit position.
+    const uint32_t products = mul_table(a);
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint64_t m0 = products & 0xff, m1 = (products >> 8) & 0xf;
+    const uint64_t m2 = (products >> 16) & 0xf, m3 = (products >> 24) & 0xf;
+    for (int w = 0; w < 4; w++) {
+        const uint64_t v = in[w];
+        acc[w] ^= (v & nibble_lsb) * m0 ^ ((v >> 1) & nibble_lsb) * m1
+                ^ ((v >> 2) & nibble_lsb) * m2 ^ ((v >> 3) & nibble_lsb) * m3;
+    }
+}
+
+static
+inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    // acc ^= a * in over `legs` 64-bit words; mul_table(a) supplies one multiplier per input bit position.
+    const uint32_t products = mul_table(a);
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint64_t m0 = products & 0xff, m1 = (products >> 8) & 0xf;
+    const uint64_t m2 = (products >> 16) & 0xf, m3 = (products >> 24) & 0xf;
+    for (int w = 0; w < legs; w++) {
+        const uint64_t v = in[w];
+        acc[w] ^= (v & nibble_lsb) * m0 ^ ((v >> 1) & nibble_lsb) * m1
+                ^ ((v >> 2) & nibble_lsb) * m2 ^ ((v >> 3) & nibble_lsb) * m3;
+    }
+}
+
+static // acc ^= x * in for every packed 4-bit element of the 4 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_msb = 0x8888888888888888ULL; // top bit (x^3 coefficient) of every 4-bit element
+    for(int i=0; i < 4;i++){ // 4 words of 16 elements each
+        uint64_t t = in[i] & mask_msb; // coefficients that would overflow to x^4 when shifted up
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); // shift the remaining bits up; x^4 reduces to x + 1 (0x3) modulo x^4+x+1
+    }
+}
+
+static // acc ^= x^-1 * in for every packed 4-bit element of the 4 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_lsb = 0x1111111111111111ULL; // lowest bit (constant coefficient) of every 4-bit element
+    for(int i=0; i < 4;i++){ // 4 words of 16 elements each
+        uint64_t t = in[i] & mask_lsb; // constant terms, which cannot simply be shifted down
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); // shift the rest down; 1/x equals x^3 + 1 (0x9) modulo x^4+x+1
+    }
+}
+
+static // Folds 16 bins of 4 words each (bin b at bins + b*4) into one vector and copies it to `out`; `bins` is modified in place.
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains end in bin 1: one multiplies by x^-1 (5,10,7,14,15,13,9,1), one by x (11,12,6,3,8,4,2,1). Bin 0 is never read.
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins +  10 * 4); // bin10 ^= x^-1 * bin5
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4); // bin12 ^= x * bin11
+    m_vec_mul_add_x_inv_64(bins +  10 * 4, bins +  7 * 4); // bin7 ^= x^-1 * bin10
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4); // bin6 ^= x * bin12
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins +  14 * 4); // bin14 ^= x^-1 * bin7
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4); // bin3 ^= x * bin6
+    m_vec_mul_add_x_inv_64(bins +  14 * 4, bins +  15 * 4); // bin15 ^= x^-1 * bin14
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4); // bin8 ^= x * bin3
+    m_vec_mul_add_x_inv_64(bins +  15 * 4, bins +  13 * 4); // bin13 ^= x^-1 * bin15
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4); // bin4 ^= x * bin8
+    m_vec_mul_add_x_inv_64(bins +  13 * 4, bins +  9 * 4); // bin9 ^= x^-1 * bin13
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4); // bin2 ^= x * bin4
+    m_vec_mul_add_x_inv_64(bins +   9 * 4, bins +  1 * 4); // bin1 ^= x^-1 * bin9
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4); // bin1 ^= x * bin2
+    vec_copy_64(bins + 4, out); // bins + 4 is bin 1, which now holds the folded result
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+static
+inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    // Duplicate one 96-element vector from `in` into `out`.
+    // The vector occupies exactly six 64-bit words (out[0..5] = in[0..5]),
+    // written lowest index first; `in` is only read.
+    for (int w = 0; w < 6; w++) {
+        out[w] = in[w];
+    }
+}
+
+static
+inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    // Add one 96-element vector into an accumulator.
+    // Addition is a bitwise XOR of the six 64-bit words of `in` into `acc`
+    // (acc[0..5] ^= in[0..5]), lowest index first; `in` is only read.
+    for (int w = 0; w < 6; w++) {
+        acc[w] ^= in[w];
+    }
+}
+
+static // acc ^= x * in for every packed 4-bit element of the 6 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_msb = 0x8888888888888888ULL; // top bit (x^3 coefficient) of every 4-bit element
+    for(int i=0; i < 6;i++){ // 6 words of 16 elements each
+        uint64_t t = in[i] & mask_msb; // coefficients that would overflow to x^4 when shifted up
+        acc[i] ^= ((in[i] ^ t) << 1) ^ ((t >> 3) * 3); // shift the remaining bits up; x^4 reduces to x + 1 (0x3) modulo x^4+x+1
+    }
+}
+
+static // acc ^= x^-1 * in for every packed 4-bit element of the 6 input words; storage class comes first, as in the other helpers of this header
+inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    uint64_t mask_lsb = 0x1111111111111111ULL; // lowest bit (constant coefficient) of every 4-bit element
+    for(int i=0; i < 6;i++){ // 6 words of 16 elements each
+        uint64_t t = in[i] & mask_lsb; // constant terms, which cannot simply be shifted down
+        acc[i] ^= ((in[i] ^ t) >> 1) ^ (t * 9); // shift the rest down; 1/x equals x^3 + 1 (0x9) modulo x^4+x+1
+    }
+}
+
+static
+inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    // acc ^= a * in over the 6 words of a 96-element vector; mul_table(a) supplies one multiplier per input bit position.
+    const uint32_t products = mul_table(a);
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint64_t m0 = products & 0xff, m1 = (products >> 8) & 0xf;
+    const uint64_t m2 = (products >> 16) & 0xf, m3 = (products >> 24) & 0xf;
+    for (int w = 0; w < 6; w++) {
+        const uint64_t v = in[w];
+        acc[w] ^= (v & nibble_lsb) * m0 ^ ((v >> 1) & nibble_lsb) * m1
+                ^ ((v >> 2) & nibble_lsb) * m2 ^ ((v >> 3) & nibble_lsb) * m3;
+    }
+}
+
+static // Folds 16 bins of 6 words each (bin b at bins + b*6) into one vector and copies it to `out`; `bins` is modified in place.
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // Two interleaved chains end in bin 1: one multiplies by x^-1 (5,10,7,14,15,13,9,1), one by x (11,12,6,3,8,4,2,1). Bin 0 is never read.
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins +  10 * 6); // bin10 ^= x^-1 * bin5
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6); // bin12 ^= x * bin11
+    m_vec_mul_add_x_inv_96(bins +  10 * 6, bins +  7 * 6); // bin7 ^= x^-1 * bin10
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6); // bin6 ^= x * bin12
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins +  14 * 6); // bin14 ^= x^-1 * bin7
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6); // bin3 ^= x * bin6
+    m_vec_mul_add_x_inv_96(bins +  14 * 6, bins +  15 * 6); // bin15 ^= x^-1 * bin14
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6); // bin8 ^= x * bin3
+    m_vec_mul_add_x_inv_96(bins +  15 * 6, bins +  13 * 6); // bin13 ^= x^-1 * bin15
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6); // bin4 ^= x * bin8
+    m_vec_mul_add_x_inv_96(bins +  13 * 6, bins +  9 * 6); // bin9 ^= x^-1 * bin13
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6); // bin2 ^= x * bin4
+    m_vec_mul_add_x_inv_96(bins +   9 * 6, bins +  1 * 6); // bin1 ^= x^-1 * bin9
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6); // bin1 ^= x * bin2
+    vec_copy_96(bins + 6, out); // bins + 6 is bin 1, which now holds the folded result
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdalign.h>
+
+#ifndef MAYO_VARIANT
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) { // generic-width bin folding; bin b starts at bins + b * m_legs * 2 and `bins` is modified in place
+    // Each composite bin is added into two lower bins whose indices XOR to it (presumably m_vec_add XORs its 2nd pointer into its 3rd - TODO confirm). 15 = 12 ^ 3
+    m_vec_add(m_legs, bins + 15 * m_legs * 2, bins + 12 * m_legs * 2);
+    m_vec_add(m_legs, bins + 15 * m_legs * 2, bins +  3 * m_legs * 2);
+    // 14 = 8 ^ 6
+    m_vec_add(m_legs, bins + 14 * m_legs * 2, bins +  8 * m_legs * 2);
+    m_vec_add(m_legs, bins + 14 * m_legs * 2, bins +  6 * m_legs * 2);
+    // 13 = 10 ^ 7
+    m_vec_add(m_legs, bins + 13 * m_legs * 2, bins + 10 * m_legs * 2);
+    m_vec_add(m_legs, bins + 13 * m_legs * 2, bins +  7 * m_legs * 2);
+    // 12 = 8 ^ 4
+    m_vec_add(m_legs, bins + 12 * m_legs * 2, bins +  8 * m_legs * 2);
+    m_vec_add(m_legs, bins + 12 * m_legs * 2, bins +  4 * m_legs * 2);
+    // 11 = 9 ^ 2
+    m_vec_add(m_legs, bins + 11 * m_legs * 2, bins +  9 * m_legs * 2);
+    m_vec_add(m_legs, bins + 11 * m_legs * 2, bins +  2 * m_legs * 2);
+    // 10 = 8 ^ 2
+    m_vec_add(m_legs, bins + 10 * m_legs * 2, bins +  8 * m_legs * 2);
+    m_vec_add(m_legs, bins + 10 * m_legs * 2, bins +  2 * m_legs * 2);
+    // 9 = 8 ^ 1
+    m_vec_add(m_legs, bins + 9 * m_legs * 2, bins +  8 * m_legs * 2);
+    m_vec_add(m_legs, bins + 9 * m_legs * 2, bins +  1 * m_legs * 2);
+    // 7 = 4 ^ 3
+    m_vec_add(m_legs, bins + 7 * m_legs * 2, bins +  4 * m_legs * 2);
+    m_vec_add(m_legs, bins + 7 * m_legs * 2, bins +  3 * m_legs * 2);
+    // 6 = 4 ^ 2
+    m_vec_add(m_legs, bins + 6 * m_legs * 2, bins +  4 * m_legs * 2);
+    m_vec_add(m_legs, bins + 6 * m_legs * 2, bins +  2 * m_legs * 2);
+    // 5 = 4 ^ 1
+    m_vec_add(m_legs, bins + 5 * m_legs * 2, bins +  4 * m_legs * 2);
+    m_vec_add(m_legs, bins + 5 * m_legs * 2, bins +  1 * m_legs * 2);
+    // 3 = 2 ^ 1
+    m_vec_add(m_legs, bins + 3 * m_legs * 2, bins +  2 * m_legs * 2);
+    m_vec_add(m_legs, bins + 3 * m_legs * 2, bins +  1 * m_legs * 2);
+    // Only bins 8, 4, 2 and 1 still matter; fold them 8 -> 4 -> 2 -> 1 (presumably each call adds x times its 2nd pointer into its 3rd - TODO confirm).
+    m_vec_mul_add_x(m_legs, bins + 8 * m_legs * 2, bins + 4 * m_legs * 2);
+    m_vec_mul_add_x(m_legs, bins + 4 * m_legs * 2, bins + 2 * m_legs * 2);
+    m_vec_mul_add_x(m_legs, bins + 2 * m_legs * 2, bins + 1 * m_legs * 2);
+    // Bin 1 holds the folded result.
+    m_vec_copy(m_legs, bins + 1 * m_legs * 2, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *PS) {
+    // P1 and P3 are walked as packed upper triangles (entry j >= row only), P2 as v rows of o entries; S is read as k rows of n bytes; PS gets entry (row, col) at index row * k + col.
+    const int n = o + v;
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+    (void)m;
+#else
+    const int m_legs = m / 32;
+#endif
+    // m is unused when a fixed-width variant is compiled; m_legs is only defined for the generic code paths.
+    /* Old approach which is constant time but doesn't have to be
+    unsigned char S1[V_MAX*K_MAX];
+    unsigned char S2[O_MAX*K_MAX];
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
+    */
+    // Bin method: each P entry is added into one of 16 bins selected by the matching byte of S (assumes every S byte is < 16 - TODO confirm); the bins are folded afterwards.
+    // use more stack efficient version for MAYO_3 and MAYO_5
+    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+    uint64_t accumulator[M_MAX * N_MAX] = {0}; // bins for one column of the result at a time
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) {
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) { // reset all bins before each column
+            accumulator[i] = 0;
+        }
+        P1_used = 0;
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 ); // NOTE(review): this branch calls bitsliced_* helpers while the default path calls m_vec_add / m_multiply_bins - confirm both exist
+#endif
+                P1_used ++;
+            }
+            // P2 entries of this row, paired with the last o bytes of row `col` of S
+            for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+            }
+        }
+        // rows v..n-1 only receive P3 entries
+        P3_used = 0;
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P3_used ++;
+            }
+        }
+        // fold the 16 bins of every row into entry (row, col) of PS
+        for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+           multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+           multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+           multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+           bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+        }
+    }
+
+    #else
+    // Default path: bins for all n * k result entries are kept at once, so P1, P2 and P3 are each traversed a single time.
+    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+    int P1_used = 0;
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P1_used ++;
+        }
+
+        // P2 entries of this row, paired with the last o bytes of each row of S (the unrolled branches spell out the loop over col)
+        for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+            }
+#endif
+        }
+    }
+    // rows v..n-1 only receive P3 entries
+    int P3_used = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P3_used ++;
+        }
+    }
+    // entry i = row * k + col owns the 16 bins starting at accumulator + i * 16 vectors
+    // multiply stuff according to the bins of the accumulator and add to PS.
+    int i = 0;
+    while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+        i++;
+#endif
+    }
+
+    #endif
+}
+
+
+// Bucket-style product of S and PS: m-vectors of PS are first summed into bins chosen by the entries of S, then the multiply_bins helper (defined elsewhere) folds each group of 16 bins into SPS.
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){
+    alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0};
+    #if !defined(MAYO_VARIANT)
+    const int m_legs = m/32;
+    #else
+    (void) m;
+    #endif
+    // Pass 1: for output cell (row, col), add PS entry (j, col) into the bin selected by S[row * n + j].
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            const int bin = S[row * n + j];
+            for (int col = 0; col < k; col++) {
+                const int src = j * k + col;
+                const int dst = (row * k + col) * 16 + bin;
+                #if defined(MAYO_VARIANT) && (M_MAX == 64)
+                    vec_add_64(PS + src * 4, accumulator + dst * 4);
+                #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                    vec_add_96(PS + src * 6, accumulator + dst * 6);
+                #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                    vec_add_128(PS + src * 8, accumulator + dst * 8);
+                #else
+                    m_vec_add(m_legs, PS + src * m_legs * 2, accumulator + dst * m_legs * 2);
+                #endif
+            }
+        }
+    }
+
+    // Pass 2: hand the 16 bins of each of the k*k output cells to the multiply_bins helper, which produces that cell of SPS.
+    for (int cell = 0; cell < k * k; cell++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + cell * 16 * 4, SPS + cell * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + cell * 16 * 6, SPS + cell * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + cell * 16 * 8, SPS + cell * 8);
+#else
+        m_multiply_bins(m_legs, accumulator + cell * 16 * m_legs * 2, SPS + cell * m_legs * 2);
+#endif
+    }
+}
+
+
+// Adds bs_mat * mat to acc, slot-wise for the interleaved matrices held in bs_mat (only entries with c >= r are stored when triangular is 1).
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+    int used = 0; // number of stored bs_mat entries consumed so far
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++, used++) {
+            for (int j = 0; j < mat_cols; j++) {
+                const unsigned char scalar = mat[c * mat_cols + j];
+                const int out = r * mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * used, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * used, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * used, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * used, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+
+// Adds bs_mat * mat^T to acc, slot-wise for the interleaved matrices held in bs_mat (only entries with c >= r are stored when triangular is 1).
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+    int used = 0; // number of stored bs_mat entries consumed so far
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++, used++) {
+            for (int j = 0; j < mat_rows; j++) {
+                const unsigned char scalar = mat[j * bs_mat_cols + c]; // mat is read transposed
+                const int out = r * mat_rows + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * used, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * used, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * used, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * used, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+
+// Adds mat^T * bs_mat to acc, slot-wise for the interleaved matrices in bs_mat (mat is mat_rows x mat_cols, row-major, one entry per byte).
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+    for (int r = 0; r < mat_cols; r++) {
+        for (int c = 0; c < mat_rows; c++) {
+            for (int j = 0; j < bs_mat_cols; j++) {
+                const int src = c * bs_mat_cols + j, dst = r * bs_mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * src, mat[c * mat_cols + r], acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * src, mat[c * mat_cols + r], acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * src, mat[c * mat_cols + r], acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * src, mat[c * mat_cols + r], acc + m_legs * 2 * dst);
+#endif
+            }
+        }
+    }
+}
+
+// Adds mat * bs_mat to acc, slot-wise for the interleaved matrices in bs_mat (mat is mat_rows x mat_cols, row-major, one entry per byte).
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+    for (int r = 0; r < mat_rows; r++) {
+        for (int c = 0; c < mat_cols; c++) {
+            for (int j = 0; j < bs_mat_cols; j++) {
+                const int src = c * bs_mat_cols + j, dst = r * bs_mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * src, mat[r * mat_cols + c], acc + 4 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * src, mat[r * mat_cols + c], acc + 6 * dst);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * src, mat[r * mat_cols + c], acc + 8 * dst);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * src, mat[r * mat_cols + c], acc + m_legs * 2 * dst);
+#endif
+            }
+        }
+    }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+// Returns element number `index` of a packed vector: 16 four-bit elements per 64-bit word, lowest nibble first.
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    const uint64_t word = in[index / 16];
+    const int shift = (index % 16) * 4;
+    return (unsigned char) ((word >> shift) & 0xF);
+}
+
+// Packs ncols four-bit values (one per byte of `in`) into the bytes of `out`,
+// two values per byte: even-indexed values go to the low nibble, odd-indexed
+// values to the high nibble. With an odd ncols the final high nibble is zero.
+// Bytes of `out` beyond the packed ones are left untouched.
+// On TARGET_BIG_ENDIAN builds the byte position is mirrored within each 8-byte
+// group (presumably so the uint64_t words hold the same values on both layouts).
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+    unsigned char *bytes = (unsigned char *)out;
+    for (int col = 0; col < ncols; col += 2) {
+        const unsigned char lo = in[col];
+        const unsigned char hi = (col + 1 < ncols) ? in[col + 1] : 0;
+#ifdef TARGET_BIG_ENDIAN
+        bytes[(((col/2 + 8) / 8) * 8) - 1 - (col/2)%8]  = lo | (hi << 4);
+#else
+        bytes[col/2]  = lo | (hi << 4);
+#endif
+    }
+}
+
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) { // expands legs * 16 packed nibbles into one value per byte of out
+    const unsigned char *packed = (const unsigned char *)in;
+    for (int b = 0; b < legs * 8; b++) {
+#ifdef TARGET_BIG_ENDIAN
+        const int pos = (((b + 8) / 8) * 8) - 1 - b%8;
+#else
+        const int pos = b;
+#endif
+        out[2 * b]     = (packed[pos]) & 0xF; // low nibble is the even-indexed value
+        out[2 * b + 1] = (packed[pos] >> 4);  // high nibble is the odd-indexed value
+    }
+}
+
+
+// Put the nrows x ncols matrix A (row-major, one 4-bit entry per byte) into row echelon form with ones
+// on the first nonzero entries, in place and *in constant time* with respect to the secret pivot rows.
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+    // Scratch: two packed pivot-row buffers and a packed copy of A (16 entries per word, row_len words per row).
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };
+
+    int row_len = (ncols + 15) / 16;
+
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+
+    // pivot row is secret, pivot col is not
+
+    unsigned char inverse;
+    int pivot_row = 0;
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+
+        // try to get a pivot row in constant time
+        unsigned char pivot = 0;
+        uint64_t pivot_is_zero = -1;
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+            // both values are used as word masks below: "row == pivot_row" and "row > pivot_row"
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+            // add the row if it is the pivot row, or if it lies below it while the pivot found so far is zero
+            for (int j = 0; j < row_len; j++) {
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            for (int col = 0; col < row_len; col++) {
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+
+        // eliminate entries below pivot; pivot_row is secret, so the comparison goes through the ct helper
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            unsigned char below_pivot = (unsigned char) (ct_64_is_greater_than(row, pivot_row) & 1);
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+            // rows that are not below the pivot get scalar 0 (presumably a no-op add in vec_mul_add_u64)
+            vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+                                    packed_A + row * row_len);
+        }
+        // advance by one only when a nonzero pivot was found (assuming ct_compare_64 yields an all-ones/all-zero mask)
+        pivot_row += (-(int64_t)(~pivot_is_zero));
+    }
+
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+
+    // unbitslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = temp[j];
+        }
+    }
+
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+// Unpacks mdeclen four-bit values from m (two per byte, low nibble first) into mdec, one value per byte.
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+    for (int b = 0; b < full_bytes; b++) {
+        mdec[2 * b]     = m[b] & 0xf;
+        mdec[2 * b + 1] = m[b] >> 4;
+    }
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+// Packs mlen values from m into menc, two per byte (first value in the low nibble); an odd final value is stored alone.
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int pairs = mlen / 2;
+    for (int b = 0; b < pairs; b++) {
+        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
+    }
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[2 * pairs];
+    }
+}
+
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // Folds the k x k array of m-entry vectors vPv into a single m-entry vector (held in temp) and writes y[i] = t[i] XOR entry i of it.
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0}; // accumulator, 16 four-bit entries per word
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    int k = 0; // word index into temp; it is 0 only on the very first pass, when temp is still all zero
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) { // pairs (i, j) with i <= j, starting from the last row
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // entry shifted out of word k
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60; // carry the top entry of word k into word k+1
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){ // even entries live in the low nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else { // odd entries live in the high nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+            // entry (i, j) of vPv is added and, off the diagonal, its mirror entry (j, i) as well
+            // extract from vPv and add
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--; // leave k on the last word (m/16 - 1) for the next shift
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2) // two entries (one byte of temp) per iteration
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+static void transpose_16x16_nibbles(uint64_t *M){
+    static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+    static const uint64_t even_bytes   = 0x00ff00ff00ff00ff;
+    static const uint64_t even_2bytes  = 0x0000ffff0000ffff;
+    static const uint64_t even_half    = 0x00000000ffffffff;
+    // In-place transpose (per the function name) of 16 words of 16 nibbles each, done as four rounds of masked swaps between words 1, 2, 4 and 8 apart.
+    for (size_t i = 0; i < 16; i+=2) // round 1: exchange nibbles between words i and i+1
+    {
+        uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; // bits that differ between the two nibble groups being swapped
+        M[i  ] ^= t << 4;
+        M[i+1] ^= t;
+    }
+
+    for (size_t i = 0; i < 16; i+=4) // round 2: exchange bytes between words i, i+1 and i+2, i+3
+    {
+        uint64_t t0 = ((M[i  ] >> 8) ^ M[i+2]) & even_bytes; // word i   <-> word i+2
+        uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; // word i+1 <-> word i+3
+        M[i  ] ^= (t0 << 8);
+        M[i+1] ^= (t1 << 8);
+        M[i+2] ^= t0;
+        M[i+3] ^= t1;
+    }
+    // round 3: exchange 16-bit groups between words 4 apart (i <-> i+4 and i+8 <-> i+12)
+    for (size_t i = 0; i < 4; i++)
+    {
+        uint64_t t0 = ((M[i  ] >> 16) ^ M[i+ 4]) & even_2bytes;
+        uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+        M[i   ] ^= t0 << 16;
+        M[i+ 8] ^= t1 << 16;
+        M[i+ 4] ^= t0;
+        M[i+12] ^= t1;
+    }
+    // round 4: exchange 32-bit halves between words 8 apart
+    for (size_t i = 0; i < 8; i++)
+    {
+        uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; // word i <-> word i+8
+        M[i  ] ^= t << 32;
+        M[i+8] ^= t;
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    // Builds A_out (m rows of PARAM_A_cols(p) bytes, one 4-bit entry per byte; the last column is not written here) from the k blocks of VtL.
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k columns, rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) { // each pair i <= j gets its own shift, see the end of this loop body
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){ // the bits shifted out of this word spill into the next word row
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) { // off the diagonal, M_i is added to the column block of j as well
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4; // the next pair lands one entry (4 bits) further down
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+    // tab[4*i + b] = f_tail[i] * 2^b, so a product with any 4-bit value can be assembled bit by bit below
+    unsigned char tab[F_TAIL_LEN*4] = {0};
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    // fold rows m .. m + k(k+1)/2 - 1 back into rows r - m + t (t < F_TAIL_LEN), 16 columns at a time
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // bit 0 of each of the 16 entries in this word
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; // bit 1
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; // bit 2
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; // bit 3
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+    // unpack the first m rows into A_out; each word of A supplies up to 16 consecutive entries of one row
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+/*
+ * Generates a MAYO key pair.
+ *
+ * Thin wrapper: forwards p, pk and sk to mayo_keypair_compact() and returns
+ * its status code unchanged. A value other than MAYO_OK signals that key
+ * generation failed.
+ */
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    // nothing to clean up here, so the status is passed straight through
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+// Signs m (mlen bytes) with the compact secret key csk: writes the encoded s followed by the salt to sig and its length to *siglen.
+// Returns MAYO_OK on success; MAYO_ERR if randombytes fails or none of the 256 attempts yields a solution.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    int solved = 0;                            // becomes 1 once sample_solution succeeds
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tenc, t, param_m); // may not be necessary
+
+    // up to 256 attempts; each one derives fresh V and r by changing the trailing counter byte of tmp
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+        decode(V + param_k * param_v_bytes, r, param_k * param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            solved = 1;
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+    if (!solved) {
+        ret = MAYO_ERR; // x holds no solution, so do not emit a signature built from it
+        goto err;
+    }
+
+    // every entry of s is written below: n - o entries per row by mat_add, the remaining o by memcpy
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(y, M_MAX);            // declared as secret data above
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));  // derived from the secret Vdec and L
+    mayo_secure_clear(tmp, DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+// Produces a signed message: sm receives the signature followed by a copy of m, *smlen the total length.
+// m may overlap sm. Returns MAYO_OK on success; any other value means *smlen was not written.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    // Move the message to its final place *before* signing and sign that copy: writing the
+    // signature into sm first would clobber the start of m whenever the caller signs in place.
+    memmove(sm + param_sig_bytes, m, mlen);
+    int ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret == MAYO_OK && siglen != (size_t) param_sig_bytes) { ret = MAYO_ERR; } // an unexpected length is a failure
+    if (ret == MAYO_OK) { *smlen = siglen + mlen; }
+    return ret;
+}
+
+// Verifies a signed message sm (signature followed by the message) and, on success, copies the message to m.
+// Returns MAYO_ERR if sm is shorter than a signature, else the result of mayo_verify; m and *mlen are written only on MAYO_OK.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    if (smlen < (size_t)param_sig_bytes) {
+        return MAYO_ERR;
+    }
+    const unsigned char *msg = sm + param_sig_bytes;
+    const size_t msg_len = smlen - param_sig_bytes;
+    const int result = mayo_verify(p, msg, msg_len, sm, pk);
+    if (result != MAYO_OK) { return result; }
+    *mlen = msg_len;
+    memmove(m, msg, msg_len); // memmove: m is allowed to overlap sm
+    return result;
+}
+
+// Generates a key pair: csk receives a fresh random seed, cpk receives seed_pk followed by Upper(P3).
+// Returns MAYO_OK on success, MAYO_ERR if randombytes fails.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    int m_legs = param_m / 32;
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // S carries the encoded secret O; P is wiped defensively (its P2 part presumably ends up holding P1*O + P2 - TODO confirm)
+    mayo_secure_clear(S, sizeof(S));
+    mayo_secure_clear(P, sizeof(P));
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+// Expands the compact public key cpk (seed followed by P3) into pk laid out as P1, P2, P3. Always returns MAYO_OK.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int prf_len = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+    // P1 and P2 are regenerated from the seed at the start of cpk
+    PK_PRF(pk, prf_len, cpk, seed_len);
+    // P3 is stored after the seed and is copied behind P1 and P2
+    memmove(pk + prf_len, cpk + seed_len, PARAM_P3_bytes(p));
+    return MAYO_OK;
+}
+
+// Expands the compact secret key csk (the secret seed) into sk:
+// sk->p receives P1 followed by L, sk->o receives the encoded O.
+// Always returns MAYO_OK.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk followed by the encoded O
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];           // decoded O, one entry per byte
+    uint64_t *const P1 = sk->p;
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int prf_bytes = param_P1_bytes + param_P2_bytes;
+
+    // S = shake256(seed_sk); its tail is the encoded O
+    shake256(S, param_pk_seed_bytes + param_O_bytes, csk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(S, param_pk_seed_bytes);
+#endif
+
+    // P1 and P2 come out of the PRF already in their stored format, so no encode/decode step is needed
+    PK_PRF((unsigned char *)P1, prf_bytes, S, param_pk_seed_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // bring the words into native order for the arithmetic below
+    for (int w = 0; w < prf_bytes / 8; ++w) {
+        P1[w] = BSWAP64(P1[w]);
+    }
+#endif
+
+    // compute L_i = (P1 + P1^t)*O + P2; L points at the P2 region that follows P1 in sk->p
+    uint64_t *const L = P1 + (param_P1_bytes / 8);
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap the words back to the stored byte order
+    for (int w = 0; w < prf_bytes / 8; ++w) {
+        P1[w] = BSWAP64(P1[w]);
+    }
+#endif
+
+    // wipe the local copies of the seed expansion and of O
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return MAYO_OK;
+}
+
+
+// Verifies signature sig on message m (mlen bytes) under the compact public key cpk.
+// Returns MAYO_OK for a valid signature and MAYO_ERR otherwise (including a failed key expansion).
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    if (mayo_expand_pk(p, cpk, (unsigned char *)pk) != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    // the expanded key holds P1, P2 and P3 back to back
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    // the three parts are contiguous, so one pass over all their words swaps them
+    const int pk_words = param_P1_bytes / 8 + param_P2_bytes / 8 + param_P3_bytes / 8;
+    for (int w = 0; w < pk_words; ++w) {
+        pk[w] = BSWAP64(pk[w]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t from the digest and the salt stored at the end of the signature
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    // y is all zero and is passed as both the t input and the output,
+    // so after the call it holds the combined vector itself
+    compute_rhs(p, SPS, y, y);
+
+    // good signature exactly when the first m entries of y equal t
+    const int matches = (memcmp(y, t, param_m) == 0);
+
+    return matches ? MAYO_OK : MAYO_ERR;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // dynamic build: each numeric *_MAX is the largest value over MAYO_1/2/3/5
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // single-variant build: sizes come from the selected parameter set
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // dedicated parameter; same value as salt_bytes in all four sets
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail;
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes;
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name;
+} mayo_params_t;
+
+typedef struct sk_t { // expanded secret key; this is the esk output type of mayo_expand_sk
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // 64-bit words covering P1_BYTES_MAX + P2_BYTES_MAX bytes
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX bytes
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ *
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key (an sk_t).
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked (read-only)
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 (MAYO_OK) if verification succeeded, 1 (MAYO_ERR) otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/mem.h b/src/sig/mayo/pqmayo_mayo-2_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // GCC/Clang: use the compiler byte-swap builtins
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // other compilers: shift-and-mask fallback; the argument is expanded several times, so it must be free of side effects
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32)) // byte-swaps each 32-bit half and exchanges the halves; NOTE(review): relies on i being a 64-bit unsigned value
+#endif
+
+// Branch-free a > b test: b - a, taken in 64 bits so it cannot overflow, is negative exactly when a > b.
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a); // widened like ct_64_is_greater_than; plain int b - a is undefined on overflow
+    return (uint32_t) (diff >> (8*sizeof(uint64_t)-1)); // sign replicated across the word (arithmetic shift assumed, as before), then truncated
+}
+
+// Branch-free a > b test producing a 64-bit mask: with both operands widened, b - a is negative exactly when a > b.
+// Yields 0xFFFFFFFFFFFFFFFF when a > b and 0x0000000000000000 otherwise.
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t wide_a = a, wide_b = b;
+    return (uint64_t) ((wide_b - wide_a) >> 63);
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF (branch-free; unsigned arithmetic, so also correct when a ^ b is negative)
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t)(0u - ((((uint32_t)(a ^ b)) | (0u - (uint32_t)(a ^ b))) >> 31)); // x | -x has its top bit set iff x != 0
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (branch-free; unsigned arithmetic, so also correct when a ^ b is negative)
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t)0 - (((uint64_t)(uint32_t)(a ^ b) | ((uint64_t)0 - (uint64_t)(uint32_t)(a ^ b))) >> 63); // x | -x has its top bit set iff x != 0
+}
+
+// Branch-free byte comparison: 0x00 when a == b, 0xFF when they differ.
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) (0u - (((unsigned int) (a ^ b) + 0xFFu) >> 8)); // (x + 255) >> 8 is 1 iff x != 0 for x in 0..255
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding to OQS_MEM_secure_free.
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding to OQS_MEM_cleanse; the memory is not freed.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/params.c b/src/sig/mayo/pqmayo_mayo-2_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC // the parameter-set objects are only defined in dynamic-parameter builds
+static const unsigned char f_tail_64[] = F_TAIL_64; // selected by MAYO_1_f_tail_arr and MAYO_2_f_tail_arr
+static const unsigned char f_tail_96[] = F_TAIL_96; // selected by MAYO_3_f_tail_arr
+static const unsigned char f_tail_128[] = F_TAIL_128; // selected by MAYO_5_f_tail_arr
+// Defines one mayo_params_t named nm from the nm##_<field> macros in mayo.h; the invocation supplies the ';'. Unlisted fields (R_bytes) are zero-initialized.
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-2_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// Product of two GF(16) elements, reduced modulo x^4 + x + 1.
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // Carry-less product: one partial product per bit of a's low nibble,
+    // combined with XOR. Each partial is b shifted left by the bit position
+    // (or 0), and the total is truncated to 8 bits as in a byte accumulator.
+    const unsigned lo = ((a & 1) * b) ^ ((a & 2) * b);
+    const unsigned hi = ((a & 4) * b) ^ ((a & 8) * b);
+    const unsigned char prod = (unsigned char) (lo ^ hi);
+
+    // Fold bits 4..7 back into the low nibble: x^4 = x + 1, so every
+    // overflow bit is XORed in four and three positions lower.
+    const unsigned char overflow = prod & 0xf0;
+    return (prod ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // Scale b by the GF(16) scalar a, one byte lane at a time (the result keeps only
+    // each lane's low nibble): carry-less product first, XOR-accumulated per bit of a ...
+    uint64_t acc = (uint64_t) (a & 1) * b;
+    acc ^= (uint64_t) (a & 2) * b;
+    acc ^= (uint64_t) (a & 4) * b;
+    acc ^= (uint64_t) (a & 8) * b;
+
+    // ... then reduce every lane modulo x^4 + x + 1 by folding its high nibble down.
+    const uint64_t high = acc & 0xf0f0f0f0f0f0f0f0;
+    const uint64_t folded = acc ^ (high >> 4) ^ (high >> 3);
+    return folded & 0x0f0f0f0f0f0f0f0f;
+}
+
+// GF(16) addition: implemented as the bitwise XOR of the two operands
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: the same XOR as add_f, since subtraction equals addition in characteristic 2
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: returns the argument unchanged (every element is its own additive inverse)
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+    // Inverse in GF(16) computed as a^14 with five multiplications, in place
+    // of a lookup in the table {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+    // 10, 4, 3, 8} indexed by a & 15. Zero maps to zero.
+    const unsigned char sq = mul_f(a, a);         // a^2
+    const unsigned char quad = mul_f(sq, sq);     // a^4
+    const unsigned char oct = mul_f(quad, quad);  // a^8
+    const unsigned char sixth = mul_f(sq, quad);  // a^6
+
+    // a^14 = a^8 * a^6; a^15 = 1 for every nonzero a, so a^14 is a^-1
+    return mul_f(oct, sixth);
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) {
+    unsigned char acc = 0; // GF(16) sum of a[i] * b[i * m] for i in [0, n): b is read with stride m
+    for (int idx = 0; idx < n; idx++) {
+        acc = add_f(mul_f(a[idx], b[idx * m]), acc);
+    }
+    return acc;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; r++) { // row-major: c[row_a][col_b] = a[row_a][colrow_ab] * b[colrow_ab][col_b]
+        for (int k = 0; k < col_b; k++) {
+            c[r * col_b + k] = lincomb(a + r * colrow_ab, b + k, colrow_ab, col_b);
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; row++) { // c = a + b element-wise over m rows of n entries each
+        for (int col = 0; col < n; col++) {
+            c[row * n + col] = add_f(a[row * n + col], b[row * n + col]);
+        }
+    }
+}
+
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+                                 uint64_t *out) {
+    for (int word = 0; word < 2 * m_legs; ++word) { // copies 2 * m_legs 64-bit words, lowest index first
+        *(out + word) = *(in + word);
+    }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+                                uint64_t *acc) {
+    for (int word = 0; word < 2 * m_legs; ++word) { // XORs 2 * m_legs 64-bit words of in into acc
+        acc[word] = acc[word] ^ in[word];
+    }
+}
+
+// Arithmetic on nibble-packed vectors of field elements of GF(16),
+// represented as gf2[x]/(x^4+x+1): each 4-bit nibble holds one element.
+// gf16v_mul_u32 multiplies all eight nibbles of a by the scalar b.
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    const uint32_t scalar = b;
+    uint32_t multiple = a; // a * x^bit in every nibble, already reduced
+    uint32_t product = 0;
+
+    for (int bit = 0; bit < 4; ++bit) {
+        // XOR in the current multiple of a when this bit of the scalar is set;
+        // the 0/1 factor does the selection without a branch.
+        product ^= multiple * ((scalar >> bit) & 1);
+
+        // Advance to the next multiple by multiplying every nibble by x:
+        //  - isolate each nibble's top bit (mask 0x8 per nibble) and clear it,
+        //  - shift the remaining three bits up by one,
+        //  - fold the isolated bit back in as x + 1 (binary 0011), since
+        //    x^4 = x + 1 modulo x^4 + x + 1.
+        const uint32_t top = multiple & 0x88888888;
+        multiple ^= top;
+        multiple = (multiple << 1) ^ ((top >> 3) * 3);
+    }
+
+    // Only the low four bits of b are consumed by the loop above.
+    return product;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int word = 0; word < 2 * m_legs; ++word) { // acc ^= a * in, over 2 * m_legs 64-bit words
+        acc[word] = acc[word] ^ gf16v_mul_u64(in[word], a);
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int word = 0; word < 2 * m_legs; ++word) { // acc ^= x * in: the scalar 0x2 is the field element x
+        acc[word] = acc[word] ^ gf16v_mul_u64(in[word], 0x2);
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-3_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-3_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-3_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                              const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 };  // non-AESNI fallback always passes an all-zero IV
+    (void) inputByteLen;          // the length of `input` is never consulted
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);
+    return (int) outputByteLen;   // echoes the requested output length
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/api.c b/src/sig/mayo/pqmayo_mayo-3_avx2/api.c
new file mode 100644
index 0000000000..5c42eabc48
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_3
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Forwards to mayo_keypair() with MAYO_PARAMS and returns its status unchanged.
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Forwards to mayo_sign() with MAYO_PARAMS and hands back its status unchanged.
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) {
+    const int status = mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+    return status;
+}
+
+// Forwards to mayo_sign_signature() with MAYO_PARAMS; the status is returned as-is.
+int crypto_sign_signature(unsigned char *sig, size_t *siglen,
+                          const unsigned char *m, size_t mlen,
+                          const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Forwards to mayo_open() with MAYO_PARAMS and hands back its status unchanged.
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+    const int status = mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+    return status;
+}
+
+// Rejects any signature whose length differs from CRYPTO_BYTES with -1;
+// otherwise returns whatever mayo_verify() reports for (m, sig, pk).
+int crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                       const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    const int length_ok = (siglen == CRYPTO_BYTES);
+    return length_ok ? mayo_verify(MAYO_PARAMS, m, mlen, sig, pk) : -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/api.h b/src/sig/mayo/pqmayo_mayo-3_avx2/api.h
new file mode 100644
index 0000000000..b08c24704e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 32
+#define CRYPTO_PUBLICKEYBYTES 2656
+#define CRYPTO_BYTES 577
+
+#define CRYPTO_ALGNAME "MAYO-3"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    const int stride = m_legs * 2;  // uint64_t words between consecutive entries
+    uint64_t *dst = out;            // next upper-triangular slot: in(r,c), plus in(c,r) off the diagonal
+    for (int row = 0; row < size; row++) {
+        for (int col = row; col < size; col++, dst += stride) {
+            m_vec_copy(m_legs, in + stride * (row * size + col), dst);
+            if (col != row) {
+                m_vec_add(m_legs, in + stride * (col * size + row), dst);
+            }
+        }
+    }
+#endif
+}
+
+// Accumulates (P1 + P1^T) * O into acc; the arithmetic.h prototype notes that
+// this is the expand_sk step. AVX2 builds with a fixed MAYO_VARIANT delegate
+// to the kernel matching M_MAX; every other configuration runs the portable
+// loop in the final #else branch.
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int o_dim = PARAM_o(p);
+    const int v_dim = PARAM_n(p) - PARAM_o(p);
+
+    // P1 is walked as a row-major upper triangle; `entry` indexes the current (r, c).
+    int entry = 0;
+    for (int r = 0; r < v_dim; r++) {
+        entry += 1;  // the diagonal entry (r, r) contributes nothing but is still counted
+        for (int c = r + 1; c < v_dim; c++) {
+            for (int k = 0; k < o_dim; k++) {
+                // acc(r,k) += P1(r,c) * O(c,k)  and, mirrored,  acc(c,k) += P1(r,c) * O(r,k)
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * entry, O[c * o_dim + k], acc + 4 * (r * o_dim + k));
+                vec_mul_add_64(P1 + 4 * entry, O[r * o_dim + k], acc + 4 * (c * o_dim + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(P1 + 6 * entry, O[c * o_dim + k], acc + 6 * (r * o_dim + k));
+                vec_mul_add_96(P1 + 6 * entry, O[r * o_dim + k], acc + 6 * (c * o_dim + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(P1 + 8 * entry, O[c * o_dim + k], acc + 8 * (r * o_dim + k));
+                vec_mul_add_128(P1 + 8 * entry, O[r * o_dim + k], acc + 8 * (c * o_dim + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[c * o_dim + k], acc + m_legs * 2 * (r * o_dim + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[r * o_dim + k], acc + m_legs * 2 * (c * o_dim + k));
+#endif
+            }
+            entry += 1;
+        }
+    }
+#endif
+}
+
+// Computes the two products the arithmetic.h prototype documents for signing:
+// M = V*L and Y = V*P1*V^T. The intermediate involving P1 and V goes through the
+// zero-initialised scratch buffer Pv. AVX2 builds with a fixed MAYO_VARIANT use
+// the kernels matching M_MAX; everything else takes the generic helpers.
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p;
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+
+#if MAYO_AVX && defined(MAYO_VARIANT) && (M_MAX == 64 || M_MAX == 96 || M_MAX == 128)
+    // Every AVX2 kernel consumes V as multiplication tables, so build them once.
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    mayo_V_multabs_avx2(V, V_multabs);
+#if M_MAX == 64
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif M_MAX == 96
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else /* M_MAX == 128 */
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#endif
+#else
+    const int m_legs = PARAM_m(p) / 32;
+    const int k_dim = PARAM_k(p);
+    const int o_dim = PARAM_o(p);
+    const int v_dim = PARAM_n(p) - PARAM_o(p);
+
+    // M from V and L
+    mul_add_mat_x_m_mat(m_legs, V, L, M,
+        k_dim, v_dim, o_dim);
+
+    // Pv from the upper-triangular P1 and V
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, V, Pv, v_dim, v_dim, k_dim, 1);
+
+    // Y from V and Pv
+    mul_add_mat_x_m_mat(m_legs, V, Pv,
+        Y, k_dim, v_dim, k_dim);
+#endif
+}
+
+// Key-generation step documented at the arithmetic.h prototype as
+// P3 = O^T * (P1*O + P2). P1O_P2 is passed to both stages below, P3 only to
+// the last one.
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && (M_MAX == 64 || M_MAX == 96 || M_MAX == 128)
+    // Every AVX2 kernel consumes O as multiplication tables, so build them once.
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+#if M_MAX == 64
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif M_MAX == 96
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else /* M_MAX == 128 */
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#endif
+#else
+    // Portable path: generic helpers driven by the parameter-set dimensions.
+    const int m_legs = PARAM_m(p) / 32;
+    const int o_dim = PARAM_o(p);
+    const int v_dim = PARAM_v(p);
+    const int n_minus_o = PARAM_n(p) - o_dim;
+
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, n_minus_o, n_minus_o, o_dim, 1);
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, v_dim, o_dim, o_dim);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+// Computes SPS for verification (arithmetic.h: SPS = S*P*S^T). PS = P * S^t is
+// built first, using the block layout sketched in the comment above, and is
+// then multiplied by S into SPS.
+//
+//   P1, P2, P3  blocks of P as in that sketch
+//   S           k rows of n = v + o bytes, row-major (read as S[r*n + c])
+//   m           consulted by the generic (non-AVX) path only
+//   SPS         output buffer handed on to the kernels
+//
+// With MAYO_AVX the kernels are chosen at compile time from M_MAX and size their
+// buffers from the *_MAX constants; only the split of S below uses v, o and k.
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if MAYO_AVX
+#if (M_MAX != 64) && (M_MAX != 96) && (M_MAX != 128)
+#error "m_calculate_PS_SPS: MAYO_AVX kernels exist only for M_MAX 64, 96 and 128"
+#endif
+    /* Old approach which is constant time but doesn't have to be */
+    // Split each row of S: the first v entries go to S1, the remaining ones to S2.
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+    for (int r = 0; r < k; r++) {
+        for (int c = 0; c < n; c++) {
+            if (c < v) {
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    // Every AVX2 kernel consumes S1 and S2 as multiplication tables.
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    // PS is used as two parts; the second one starts at this offset.
+    uint64_t *PS2 = PS + V_MAX*K_MAX*M_MAX/16;
+
+    // In each variant the first two kernels fill PS and PS2 (P1*S1 + P2*S2 and
+    // P3*S2); the two S^t products that follow both target SPS.
+#if M_MAX == 64
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS2); // upper triangular
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS2, S2_multabs, SPS);
+#elif M_MAX == 96
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS2); // upper triangular
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS2, S2_multabs, SPS);
+#else /* M_MAX == 128 */
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS2); // upper triangular
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS2, S2_multabs, SPS);
+#endif
+#else
+    // Portable path: the generic helpers take the runtime dimensions directly.
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;       // nonzero once the current row's pivot has been handled
+    int col_upper_bound;          // last column scanned for the current row's pivot
+    unsigned char correct_column; // mask: nonzero only at the current row's pivot column
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+    // EF: presumably reduces A to echelon form in place (see echelon_form.h) — confirm
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+    // NOTE(review): indexing here uses A_cols, earlier code uses k*o + 1 — presumably equal; confirm
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+        // NOTE(review): col can reach k*o, so x[k*o] may be written — confirm callers give x room
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+            // u: this row's last-column entry when col is the pivot column, otherwise 0
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+            // add u * (column col) into the last column, 8 rows per step; the final step can touch rows >= row
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a vector of 128 field elements from `in` to `out`.
+//
+// Such a vector occupies eight 64-bit limbs (128 elements of 4 bits each).
+// The limbs are copied one at a time, lowest index first.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    const int limbs = 8;
+
+    for (int limb = 0; limb < limbs; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+
+// Add a vector of 128 field elements into `acc`.
+//
+// Addition in this field is XOR, so each of the eight 64-bit limbs of `in`
+// is XORed into the matching limb of `acc`, lowest index first.
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    const int limbs = 8;
+
+    for (int limb = 0; limb < limbs; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += 0x2 * in for all eight limbs; 0x2 is the field element x the name refers to.
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc += 0x9 * in for all eight limbs; 0x9 (x^3 + 1) is the inverse of x mod x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc += a * in over all eight limbs, with `a` handed to gf16v_mul_u64 as the multiplier.
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 8; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// Folds bins 1..15 into bin 1 via x / x^-1 multiply-adds (bins are modified), then copies bin 1 to out.
+static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    enum { W = 8 };  // uint64_t limbs per bin
+    m_vec_mul_add_x_inv_128(&bins[ 5 * W], &bins[10 * W]);
+    m_vec_mul_add_x_128(    &bins[11 * W], &bins[12 * W]);
+    m_vec_mul_add_x_inv_128(&bins[10 * W], &bins[ 7 * W]);
+    m_vec_mul_add_x_128(    &bins[12 * W], &bins[ 6 * W]);
+    m_vec_mul_add_x_inv_128(&bins[ 7 * W], &bins[14 * W]);
+    m_vec_mul_add_x_128(    &bins[ 6 * W], &bins[ 3 * W]);
+    m_vec_mul_add_x_inv_128(&bins[14 * W], &bins[15 * W]);
+    m_vec_mul_add_x_128(    &bins[ 3 * W], &bins[ 8 * W]);
+    m_vec_mul_add_x_inv_128(&bins[15 * W], &bins[13 * W]);
+    m_vec_mul_add_x_128(    &bins[ 8 * W], &bins[ 4 * W]);
+    m_vec_mul_add_x_inv_128(&bins[13 * W], &bins[ 9 * W]);
+    m_vec_mul_add_x_128(    &bins[ 4 * W], &bins[ 2 * W]);
+    m_vec_mul_add_x_inv_128(&bins[ 9 * W], &bins[ 1 * W]);
+    m_vec_mul_add_x_128(    &bins[ 2 * W], &bins[ 1 * W]);
+    vec_copy_128(&bins[1 * W], out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a vector of 64 field elements (four 64-bit limbs, 4 bits per element)
+// from `in` to `out`, lowest limb first.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    for (int limb = 0; limb < 4; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// Add (XOR) a vector of 64 field elements, held in four 64-bit limbs,
+// into `acc`, lowest limb first.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += a * in over all four limbs, with `a` handed to gf16v_mul_u64 as the multiplier.
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// acc += 0x2 * in for all four limbs; 0x2 is the field element x the name refers to.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc += 0x9 * in for all four limbs; 0x9 (x^3 + 1) is the inverse of x mod x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// Folds bins 1..15 into bin 1 via x / x^-1 multiply-adds (bins are modified), then copies bin 1 to out.
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    enum { W = 4 };  // uint64_t limbs per bin
+    m_vec_mul_add_x_inv_64(&bins[ 5 * W], &bins[10 * W]);
+    m_vec_mul_add_x_64(    &bins[11 * W], &bins[12 * W]);
+    m_vec_mul_add_x_inv_64(&bins[10 * W], &bins[ 7 * W]);
+    m_vec_mul_add_x_64(    &bins[12 * W], &bins[ 6 * W]);
+    m_vec_mul_add_x_inv_64(&bins[ 7 * W], &bins[14 * W]);
+    m_vec_mul_add_x_64(    &bins[ 6 * W], &bins[ 3 * W]);
+    m_vec_mul_add_x_inv_64(&bins[14 * W], &bins[15 * W]);
+    m_vec_mul_add_x_64(    &bins[ 3 * W], &bins[ 8 * W]);
+    m_vec_mul_add_x_inv_64(&bins[15 * W], &bins[13 * W]);
+    m_vec_mul_add_x_64(    &bins[ 8 * W], &bins[ 4 * W]);
+    m_vec_mul_add_x_inv_64(&bins[13 * W], &bins[ 9 * W]);
+    m_vec_mul_add_x_64(    &bins[ 4 * W], &bins[ 2 * W]);
+    m_vec_mul_add_x_inv_64(&bins[ 9 * W], &bins[ 1 * W]);
+    m_vec_mul_add_x_64(    &bins[ 2 * W], &bins[ 1 * W]);
+    vec_copy_64(&bins[1 * W], out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a vector of 96 field elements from `in` to `out`.
+// Such a vector occupies six 64-bit limbs (96 elements of 4 bits each).
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    const int limbs = 6;
+
+    for (int limb = 0; limb < limbs; limb++) {
+        out[limb] = in[limb];
+    }
+}
+
+// Add (XOR) a vector of 96 field elements into `acc`.
+// Such a vector occupies six 64-bit limbs (96 elements of 4 bits each).
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    const int limbs = 6;
+
+    for (int limb = 0; limb < limbs; limb++) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc += 0x2 * in for all six limbs; 0x2 is the field element x the name refers to.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+
+// acc += 0x9 * in for all six limbs; 0x9 (x^3 + 1) is the inverse of x mod x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc += a * in over all six limbs, with `a` handed to gf16v_mul_u64 as the multiplier.
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// Folds bins 1..15 into bin 1 via x / x^-1 multiply-adds (bins are modified), then copies bin 1 to out.
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    enum { W = 6 };  // uint64_t limbs per bin
+    m_vec_mul_add_x_inv_96(&bins[ 5 * W], &bins[10 * W]);
+    m_vec_mul_add_x_96(    &bins[11 * W], &bins[12 * W]);
+    m_vec_mul_add_x_inv_96(&bins[10 * W], &bins[ 7 * W]);
+    m_vec_mul_add_x_96(    &bins[12 * W], &bins[ 6 * W]);
+    m_vec_mul_add_x_inv_96(&bins[ 7 * W], &bins[14 * W]);
+    m_vec_mul_add_x_96(    &bins[ 6 * W], &bins[ 3 * W]);
+    m_vec_mul_add_x_inv_96(&bins[14 * W], &bins[15 * W]);
+    m_vec_mul_add_x_96(    &bins[ 3 * W], &bins[ 8 * W]);
+    m_vec_mul_add_x_inv_96(&bins[15 * W], &bins[13 * W]);
+    m_vec_mul_add_x_96(    &bins[ 8 * W], &bins[ 4 * W]);
+    m_vec_mul_add_x_inv_96(&bins[13 * W], &bins[ 9 * W]);
+    m_vec_mul_add_x_96(    &bins[ 4 * W], &bins[ 2 * W]);
+    m_vec_mul_add_x_inv_96(&bins[ 9 * W], &bins[ 1 * W]);
+    m_vec_mul_add_x_96(    &bins[ 2 * W], &bins[ 1 * W]);
+    vec_copy_96(&bins[1 * W], out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#define K_OVER_2 ((K_MAX+1)/2)
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { // four 32-byte rows: i -> i*1, i*2, i*4, i*8 for i = 0..15, each 16-entry table stored twice
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+//
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+    // Broadcast the low nibble of b; `half` is that value shifted right by one,
+    // so bits 1 and 3 of b can be tested with the same bit-0 / bit-2 masks.
+    const __m256i nib  = _mm256_set1_epi16( b & 0xf );
+    const __m256i half = _mm256_srli_epi16( nib, 1 );
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+
+    // Lane masks: all ones when the named bit of b is set, all zeros otherwise.
+    const __m256i has1 = _mm256_cmpgt_epi16( nib  & bit0, zero );
+    const __m256i has2 = _mm256_cmpgt_epi16( half & bit0, zero );
+    const __m256i has4 = _mm256_cmpgt_epi16( nib  & bit2, zero );
+    const __m256i has8 = _mm256_cmpgt_epi16( half & bit2, zero );
+    // XOR together the 32-byte rows of __gf16_mulbase selected by the set bits.
+    const __m256i *rows = (const __m256i *) __gf16_mulbase;
+    return ( _mm256_load_si256(rows + 0) & has1 ) ^ ( _mm256_load_si256(rows + 1) & has2 )
+           ^ ( _mm256_load_si256(rows + 2) & has4 ) ^ ( _mm256_load_si256(rows + 3) & has8 );
+}
+
+// Byte-wise lookup: (v & mask_f) indexes tab_l, ((v >> 4) & mask_f) indexes tab_h; the two results are XORed.
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) {
+    const __m256i hi_idx = _mm256_srli_epi16(v, 4) & mask_f;
+    return _mm256_shuffle_epi8(tab_l, v & mask_f) ^ _mm256_shuffle_epi8(tab_h, hi_idx);
+}
+
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+    const __m256i lo_tab = tbl32_gf16_multab2( b );  // table for b; its copy shifted left by 4 serves the high nibbles
+    return linear_transform_8x8_256b( lo_tab, _mm256_slli_epi16( lo_tab, 4 ), a, _mm256_set1_epi8(0xf) );
+}
+
+// Builds one vector per pair of neighbouring entries in a row of O (row-major,
+// O_MAX per row): table of the even entry in the low nibbles, odd entry in the high ones.
+static inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    for (size_t row = 0; row < V_MAX; row++) {
+        const unsigned char *src = O + O_MAX*row;
+        __m256i *dst = O_multabs + O_MAX/2*row;
+        for (size_t pair = 0; 2*pair < O_MAX; pair++) {
+            dst[pair] = tbl32_gf16_multab2(src[2*pair]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(src[2*pair + 1]), 4);
+        }
+    }
+}
+
+
+// For each column c of V (row-major, V_MAX entries per row), packs the tables
+// of rows 2j and 2j+1 into V_multabs[K_OVER_2*c + j] (low / high nibbles).
+// When K_MAX is odd the last row gets a vector of its own, high nibbles zero.
+static inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    for (size_t col = 0; col < V_MAX; col++) {
+        __m256i *dst = V_multabs + K_OVER_2*col;
+        size_t row = 0;
+        for (; row+1 < K_MAX; row += 2) {
+            dst[row/2] = tbl32_gf16_multab2(V[V_MAX*row + col]) ^ _mm256_slli_epi16(tbl32_gf16_multab2(V[V_MAX*(row+1) + col]), 4);
+        }
+#if K_MAX % 2 == 1
+        dst[row/2] = tbl32_gf16_multab2(V[V_MAX*row + col]);
+#endif
+    }
+}
+
+/*
+ * Precomputed GF(16) multiplication tables, 32 bytes per multiplier.
+ *
+ * Entry b (at byte offset 32*b, b = 0..15) lists the products x*b for
+ * x = 0..15, stored twice back to back so that one 32-byte load yields the
+ * same 16-byte table in both halves of a 256-bit register.
+ * The array is 32-byte aligned because it is read with _mm256_load_si256.
+ */
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = {
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, 
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, 
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, 
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, 
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, 
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, 
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, 
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, 
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, 
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, 
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, 
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, 
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, 
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, 
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+/*
+ * Build the multiplication tables for S1 by lookup in mayo_gf16_mul.
+ *
+ * S1 is read with a row stride of V_MAX; each entry selects the 32-byte table
+ * at offset 32*entry. For every column, rows (r, r+1) with r < K_MAX are
+ * paired: the table of row r is XORed with the table of row r+1 shifted left
+ * by four bits. When K_MAX is odd the final row's table is stored unshifted.
+ * S1_multabs receives K_OVER_2 registers per column.
+ */
+static
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+    for (size_t col = 0; col < V_MAX; col++) {
+        __m256i *out = S1_multabs + K_OVER_2*col;
+        size_t row = 0;
+
+        while (row + 1 < K_MAX) {
+            const __m256i tab_first  = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+            const __m256i tab_second = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(row + 1) + col]));
+            out[row/2] = tab_first ^ _mm256_slli_epi16(tab_second, 4);
+            row += 2;
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: the final row stands alone
+        out[row/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+#endif
+    }
+}
+
+/*
+ * Build the multiplication tables for S2 by lookup in mayo_gf16_mul.
+ *
+ * Same scheme as for S1, except that S2 has O_MAX columns and is read with a
+ * row stride of O_MAX: rows (r, r+1) with r < K_MAX are paired per column,
+ * the second table shifted left by four bits before the XOR; an odd K_MAX
+ * leaves the final row's table unshifted.
+ * S2_multabs receives K_OVER_2 registers per column.
+ */
+static
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    for (size_t col = 0; col < O_MAX; col++) {
+        __m256i *out = S2_multabs + K_OVER_2*col;
+        size_t row = 0;
+
+        while (row + 1 < K_MAX) {
+            const __m256i tab_first  = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+            const __m256i tab_second = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(row + 1) + col]));
+            out[row/2] = tab_first ^ _mm256_slli_epi16(tab_second, 4);
+            row += 2;
+        }
+#if K_MAX % 2 == 1
+        // odd K_MAX: the final row stands alone
+        out[row/2] = _mm256_load_si256((const __m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+#endif
+    }
+}
+
+/*
+ * Multiply the sixteen 4-bit field elements packed in `a` (one per nibble)
+ * by the scalar `b`.
+ *
+ * Shift-and-add over bits 0..3 of `b` (higher bits of `b` are ignored):
+ * `shifted` holds a * x^i, and is XORed into the result when bit i of `b` is
+ * set. Multiplying by x clears each nibble's top bit, shifts left by one and
+ * XORs 3 into every nibble whose top bit was set, i.e. reduces with
+ * x^4 = x + 1. Bit selection is done by multiplying with 0 or 1, not by
+ * branching.
+ */
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t nibble_top_bits = 0x8888888888888888ULL;
+    const uint64_t scalar = b;
+    uint64_t shifted = a;
+    uint64_t product = shifted * (scalar & 1);
+
+    for (unsigned bit = 1; bit < 4; bit++) {
+        // shifted <- shifted * x, nibble-wise
+        const uint64_t overflow = shifted & nibble_top_bits;
+        shifted = ((shifted ^ overflow) << 1) ^ ((overflow >> 3) * 3);
+
+        product ^= shifted * ((scalar >> bit) & 1);
+    }
+
+    return product;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <immintrin.h>
+#include <stdint.h>
+
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// Build the multiplication table for the 4-bit value 'b'.
+// Technique from https://eprint.iacr.org/2023/059/.
+//
+// Four 32-byte base tables are loaded from __gf16_mulbase; base table i is
+// XORed into the result exactly when bit i of 'b' is set. Selection uses
+// compare-generated lane masks instead of branches.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+
+    // b in every 16-bit lane, and the same value shifted right by one
+    const __m256i b_lanes     = _mm256_set1_epi16( b & 0xf );
+    const __m256i b_lanes_shr = _mm256_srli_epi16( b_lanes, 1 );
+
+    // all-ones lanes when the tested bit of b is set, zero otherwise
+    const __m256i sel0 = _mm256_cmpgt_epi16( b_lanes     & bit0, zero );
+    const __m256i sel1 = _mm256_cmpgt_epi16( b_lanes_shr & bit0, zero );
+    const __m256i sel2 = _mm256_cmpgt_epi16( b_lanes     & bit2, zero );
+    const __m256i sel3 = _mm256_cmpgt_epi16( b_lanes_shr & bit2, zero );
+
+    const __m256i base0 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0));
+    const __m256i base1 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1));
+    const __m256i base2 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2));
+    const __m256i base3 = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3));
+
+    return ( base0 & sel0 ) ^ ( base1 & sel1 ) ^ ( base2 & sel2 ) ^ ( base3 & sel3 );
+}
+
+/*
+ * Put the matrix A in row echelon form, with ones on the first nonzero
+ * entries, in constant time.
+ *
+ * A is a row-major byte matrix, one field element per byte, updated in place.
+ * The dimensions are fixed at compile time: M_MAX rows and K_MAX*O_MAX + 1
+ * columns. The _nrows and _ncols arguments are ignored.
+ *
+ * The elimination step itself lives in echelon_form_loop.h, which is
+ * textually included as the body of each column loop below and uses the
+ * locals and macros defined here.
+ */
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+
+    (void) _nrows;
+    (void) _ncols;
+
+    // NOTE: these macros are not #undef'd within this function.
+    #define nrows M_MAX
+    #define ncols (K_MAX * O_MAX + 1)
+
+    // each row is padded up to a whole number of 32-byte AVX2 registers
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW];
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+    // byte-wise views of the two register arrays
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A in the tail of AVX2 registers
+    // (row data is right-aligned; the MAX_COLS - ncols leading bytes of each
+    //  padded row are not written here)
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++)
+        {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+        }
+    }
+
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    // first padded column that holds real data
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+    // The same loop body is repeated over five consecutive column ranges that
+    // end 128, 96, 64, 32 and 0 columns before MAX_COLS.
+    // NOTE(review): presumably split at register boundaries so the compiler
+    // can specialise each copy - confirm.
+    for (; pivot_col < MAX_COLS-128; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+        }
+    }
+    // wipe the working copies from the stack
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+ * One elimination step for the column `pivot_col`.
+ *
+ * This file is not a standalone header: it is a statement fragment meant to
+ * be included inside a loop over pivot_col (see EF in echelon_form.h). It
+ * expects these names in scope: nrows, ncols, MAX_COLS, AVX_REGS_PER_ROW,
+ * A_avx, A_bytes, _pivot_row, pivot_row_bytes, pivot_row, pivot_col, inverse.
+ */
+
+/* index of the 32-byte register that contains pivot_col; registers before it
+   are never touched in this step */
+int pivot_col_rounded = pivot_col/32;
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+/* Rows are XORed into _pivot_row under a mask instead of being selected by a
+   branch: row pivot_row is always added, and rows below it keep being added
+   while the accumulated pivot entry is still zero.
+   NOTE(review): this reading assumes ct_compare_32 returns 0 on equality and
+   all-ones otherwise, and ct_is_greater_than returns an all-ones mask when
+   true - confirm against their definitions. */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1;
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row);
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) );
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    /* re-read the candidate pivot entry after every row */
+    pivot = pivot_row_bytes[pivot_col];
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+/* byte-wise table lookup: each entry of the pivot row indexes the table */
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+    _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]);
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    unsigned char below_pivot =  (unsigned char) (ct_is_greater_than(row, pivot_row));
+    unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+    /* table for the entry to eliminate; the table of 0 for rows that are not
+       below the pivot row */
+    __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim);
+    /* only rows up to the upper bound can be the pivot row, so only they need
+       the masked write of the normalised pivot row */
+    if (row <= pivot_row_upper_bound) {
+        __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero);
+        for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { 
+            A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+                                                    _mm256_shuffle_epi8(multab, _pivot_row[col]);
+        }
+    } else {
+        for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+            A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+        }
+    }
+}
+
+/* adds 1 when ~pivot_is_zero is all-ones and 0 when it is zero, without a branch */
+pivot_row += (-(int32_t)(~pivot_is_zero));
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+/*
+ * Unpack 4-bit values from bytes.
+ *
+ * Writes mdeclen bytes to mdec, each holding one nibble of m: the low nibble
+ * of m[i] goes to mdec[2i], the high nibble to mdec[2i+1]. When mdeclen is
+ * odd, only the low nibble of the last input byte is used.
+ */
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+
+    for (int idx = 0; idx < full_bytes; ++idx) {
+        mdec[2 * idx]     = m[idx] & 0xf;
+        mdec[2 * idx + 1] = m[idx] >> 4;
+    }
+
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+/*
+ * Pack 4-bit values into bytes (inverse of decode for nibble-valued input).
+ *
+ * Reads mlen bytes from m and writes (mlen + 1) / 2 bytes to menc:
+ * menc[i] = m[2i] | (m[2i+1] << 4). When mlen is odd, the last output byte
+ * is a copy of the last input byte.
+ */
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int full_bytes = mlen / 2;
+
+    for (int idx = 0; idx < full_bytes; ++idx) {
+        const unsigned char *pair = m + 2 * idx;
+        menc[idx] = pair[0] | (pair[1] << 4);
+    }
+
+    if (mlen % 2 == 1) {
+        menc[full_bytes] = m[2 * full_bytes];
+    }
+}
+
+/*
+ * Fold the k x k grid of m-nibble vectors in _vPv into a single m-nibble
+ * value reduced modulo f(X), then XOR it onto t and store the result in y.
+ *
+ * p     parameter set (unused unless ENABLE_PARAMS_DYNAMIC is defined)
+ * _vPv  k*k blocks of m/16 64-bit words; block (i, j) starts at
+ *       (i*k + j) * (m/16)
+ * t     m input values, one per byte
+ * y     m output values, one per byte; y[i] = t[i] ^ (nibble i of the result)
+ *
+ * The accumulator `temp` holds m nibbles packed into m/16 words. For every
+ * pair (i, j), visited with i descending and j ascending from i, the
+ * accumulator is multiplied by X, reduced using the F_TAIL_LEN coefficients
+ * of PARAM_f_tail(p), and block (i, j) is added, together with block (j, i)
+ * when i != j.
+ */
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0};
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    // k is the index of the top word of temp. It starts at 0 (temp is still all
+    // zero on the first pass) and is left at m/16 - 1 by the `k--` that follows
+    // the extraction loop on every pass.
+    int k = 0;
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            // `top` is the nibble shifted out of the top word
+            unsigned char top = temp[k] >> 60;
+            temp[k] <<= 4;
+            k--;
+            // carry the top nibble of each lower word into the word above it
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            // top * f_tail[jj] is added to nibble jj (low nibble of byte jj/2 for
+            // even jj, high nibble for odd jj)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else {
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            // block (j, i) is included only off the diagonal; (i!=j) is 0 or 1
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            // leave k on the top word for the next pass
+            k--;
+        }
+    }
+
+    // add to y
+    // each byte of temp supplies two consecutive outputs (low nibble first)
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+/*
+ * One exchange step of the nibble transpose: swap the bits of *upper selected
+ * by (mask << shift) with the bits of *lower selected by mask.
+ */
+static inline void transpose_exchange(uint64_t *upper, uint64_t *lower, unsigned shift, uint64_t mask) {
+    const uint64_t diff = ((*upper >> shift) ^ *lower) & mask;
+    *upper ^= diff << shift;
+    *lower ^= diff;
+}
+
+/*
+ * Transpose, in place, a 16x16 matrix of 4-bit entries stored as sixteen
+ * 64-bit words (row i is M[i], column j is nibble j counted from the least
+ * significant end).
+ *
+ * Four rounds of exchanges are applied, at granularities of 4, 8, 16 and
+ * 32 bits, between rows that are 1, 2, 4 and 8 apart respectively.
+ */
+static void transpose_16x16_nibbles(uint64_t *M){
+    const uint64_t low_nibbles = 0x0f0f0f0f0f0f0f0f;
+    const uint64_t low_bytes   = 0x00ff00ff00ff00ff;
+    const uint64_t low_2bytes  = 0x0000ffff0000ffff;
+    const uint64_t low_half    = 0x00000000ffffffff;
+
+    // rows 1 apart, single nibbles
+    for (size_t row = 0; row < 16; row += 2) {
+        transpose_exchange(&M[row], &M[row + 1], 4, low_nibbles);
+    }
+
+    // rows 2 apart, bytes
+    for (size_t row = 0; row < 16; row += 4) {
+        transpose_exchange(&M[row],     &M[row + 2], 8, low_bytes);
+        transpose_exchange(&M[row + 1], &M[row + 3], 8, low_bytes);
+    }
+
+    // rows 4 apart, 16-bit groups
+    for (size_t row = 0; row < 4; row++) {
+        transpose_exchange(&M[row],     &M[row + 4],  16, low_2bytes);
+        transpose_exchange(&M[row + 8], &M[row + 12], 16, low_2bytes);
+    }
+
+    // rows 8 apart, 32-bit halves
+    for (size_t row = 0; row < 8; row++) {
+        transpose_exchange(&M[row], &M[row + 8], 32, low_half);
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+/*
+ * Build the matrix A_out, one field element per byte, from the blocks in
+ * _VtL.
+ *
+ * p      parameter set (unused unless ENABLE_PARAMS_DYNAMIC is defined)
+ * _VtL   k blocks of o columns, each column m/16 64-bit words; block j
+ *        starts at j * (m/16) * o
+ * A_out  output with a row stride of PARAM_A_cols(p) bytes; rows 0..m-1 and
+ *        columns 0..A_cols-2 are written, the last column is left untouched
+ *
+ * Steps: accumulate shifted copies of the blocks into a packed local matrix,
+ * transpose it in 16x16 nibble tiles, fold the rows beyond m back using the
+ * f_tail coefficients, and unpack the nibbles into A_out.
+ *
+ * NOTE(review): the local matrix A is derived from caller data documented as
+ * secret in mayo_sign_signature and is not wiped before returning - confirm
+ * whether that is intended.
+ */
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    // current shift of the contribution being added, split into whole words
+    // and remaining bits
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    // number of columns, o*k rounded up to a multiple of 16
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+    const uint64_t *Mi, *Mj;
+
+    // every pair (i, j), j descending from k-1 to i; each pair uses a shift
+    // four bits (one nibble) larger than the previous one
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    // bits shifted out of this word spill into the next one
+                    if(bits_to_shift > 0){
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            // off the diagonal, block i is also added into column group j
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4;
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    // transpose every group of 16 consecutive words as a 16x16 nibble tile
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    // tab[4*i + b] = f_tail[i] * 2^b, for b = 0..3
+    unsigned char tab[F_TAIL_LEN*4] = {0};
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    // fold rows m .. m + k(k+1)/2 - 1 back into rows r - m + t: every nibble of
+    // the source row is split into its four bits and each bit is multiplied by
+    // the matching tab entry
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble;
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    // byte-swap every word before the byte-wise decode below
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    // unpack: each word holds up to 16 consecutive entries of one output row;
+    // the final column group is truncated to A_cols - 1 - c entries
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+/*
+ * Generate a key pair.
+ *
+ * Thin wrapper: forwards to mayo_keypair_compact and returns its status
+ * unchanged.
+ */
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+/*
+ * Produce a detached signature.
+ *
+ * p       parameter set
+ * sig     output buffer, PARAM_sig_bytes(p) bytes: encoded s followed by the salt
+ * siglen  set to PARAM_sig_bytes(p) on success; not written on failure
+ * m, mlen message to sign
+ * csk     compact secret key; its first PARAM_sk_seed_bytes(p) bytes are the seed
+ *
+ * Returns MAYO_OK on success, or the error from mayo_expand_sk / MAYO_ERR if
+ * randombytes fails.
+ *
+ * Up to 256 candidate vinegar assignments are derived from a counter byte
+ * appended to (digest || salt || seed); the first one for which
+ * sample_solution succeeds is used.
+ * NOTE(review): if all 256 attempts fail the function still emits a signature
+ * and returns MAYO_OK - confirm that this case is considered negligible.
+ *
+ * All secret-dependent stack buffers are wiped on every exit path.
+ */
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+    // Mtmp receives the V * L matrices and Y the V * P1 * V^T matrices; both
+    // depend on the secret vinegar values, so they live at function scope to
+    // be wiped at `err`.
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+    alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt: salt = shake256(digest || randomizer || seed_sk)
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t: t = shake256(digest || salt)
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    // the counter byte sits right after the seed that is still in tmp
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+
+        // V = shake256(digest || salt || seed_sk || ctr)
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        // Y starts from zero on every attempt
+        memset(Y, 0, sizeof(Y));
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            break;
+        } else {
+            // Mtmp must be zero again before the next attempt
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+
+    // Assemble s: for each i, s_i = (v_i + O * x_i) || x_i. Every one of the
+    // param_n * param_k entries that encode() reads below is written here, so
+    // s needs no prior initialisation.
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    // wipe everything that depends on the secret key or the vinegar values
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(y, M_MAX);
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));
+    mayo_secure_clear(Y, sizeof(Y));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+/*
+ * Sign a message and emit the signed message sm = signature || message.
+ *
+ * p       parameter set
+ * sm      output buffer of at least PARAM_sig_bytes(p) + mlen bytes
+ * smlen   set to PARAM_sig_bytes(p) + mlen on success; not written on failure
+ * m, mlen message to sign; m may overlap sm
+ * csk     compact secret key
+ *
+ * Returns MAYO_OK on success, otherwise the error reported by
+ * mayo_sign_signature, or MAYO_ERR if the signature length is not
+ * PARAM_sig_bytes(p).
+ */
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    unsigned char *msg_out = sm + param_sig_bytes;
+
+    // Move the message to its final position before the signature is written
+    // to the front of sm: if m overlaps sm, writing the signature first would
+    // overwrite message bytes that still have to be copied.
+    memmove(msg_out, m, mlen);
+
+    ret = mayo_sign_signature(p, sm, &siglen, msg_out, mlen, csk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+    if (siglen != (size_t) param_sig_bytes) {
+        // never report success without having set *smlen
+        ret = MAYO_ERR;
+        goto err;
+    }
+
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+/*
+ * Verify a signed message sm = signature || message and recover the message.
+ *
+ * Returns MAYO_ERR when sm is shorter than a signature; otherwise returns
+ * the result of mayo_verify. Only on MAYO_OK is *mlen set and the message
+ * moved to m (m may overlap sm).
+ */
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+
+    if (smlen < (size_t)param_sig_bytes) {
+        return MAYO_ERR;
+    }
+
+    const unsigned char *msg = sm + param_sig_bytes;
+    const size_t msg_len = smlen - param_sig_bytes;
+
+    const int result = mayo_verify(p, msg, msg_len, sm, pk);
+    if (result != MAYO_OK) {
+        return result;
+    }
+
+    *mlen = msg_len;
+    memmove(m, msg, *mlen);
+    return result;
+}
+
+/*
+ * Generate a compact key pair.
+ *
+ * p    parameter set
+ * cpk  output: pk seed (PARAM_pk_seed_bytes) followed by Upper(P3)
+ *      (PARAM_P3_bytes)
+ * csk  output: the secret seed (PARAM_sk_seed_bytes)
+ *
+ * Returns MAYO_OK, or MAYO_ERR if randombytes reports a failure.
+ *
+ * All stack buffers that hold the secret O, or data computed from it, are
+ * wiped on every exit path.
+ */
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+
+    // passed to m_upper below
+    int m_legs = param_m / 32;
+
+    // P1 occupies the first P1_bytes of P; the P2 part that follows is handed
+    // to Ot_times_P1O_P2 as its P1O_P2 argument
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // S still holds the encoded O, and P was handed to the routine that
+    // combines it with O; wipe both along with O and P3.
+    mayo_secure_clear(S, sizeof(S));
+    mayo_secure_clear(P, sizeof(P));
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+/*
+ * Expand a compact public key.
+ *
+ * The first PARAM_pk_seed_bytes(p) bytes of cpk seed PK_PRF, which fills the
+ * first P1_bytes + P2_bytes of pk; the P3_bytes that follow the seed in cpk
+ * are then copied behind them. Always returns MAYO_OK.
+ */
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1_len = PARAM_P1_bytes(p);
+    const int p2_len = PARAM_P2_bytes(p);
+    const int p3_len = PARAM_P3_bytes(p);
+    unsigned char *p3_out = pk + p1_len + p2_len;
+
+    PK_PRF(pk, p1_len + p2_len, cpk, seed_len);
+    memmove(p3_out, cpk + seed_len, p3_len);
+    return MAYO_OK;
+}
+
+/*
+ * Expand a compact secret key into sk.
+ *
+ * p    parameter set
+ * csk  compact secret key; its first PARAM_sk_seed_bytes(p) bytes are the seed
+ * sk   output: sk->p receives P1 followed by L, sk->o the encoded O
+ *
+ * Always returns MAYO_OK (ret is never changed after initialisation).
+ * The local copies of S and O are wiped before returning.
+ */
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    // the pk seed is the first pk_seed_bytes of S
+    unsigned char *seed_pk = S;
+
+    // S = shake256(seed_sk); the bytes after the pk seed are the encoded O
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    // P2 starts right after P1 inside sk->p
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    // byte-swap every word before the word-wise arithmetic below
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    // L aliases P2, so the call works on the P2 storage directly
+    uint64_t *L = P2;
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap back so the stored words use the same byte order as before
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+/*
+ * Verify a detached signature.
+ *
+ * p       parameter set
+ * m, mlen message
+ * sig     signature of PARAM_sig_bytes(p) bytes: encoded s followed by the salt
+ * cpk     compact public key
+ *
+ * Returns MAYO_OK when the value recomputed from the signature equals the
+ * target t derived from the message digest and salt, MAYO_ERR otherwise
+ * (including when mayo_expand_pk fails).
+ */
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    // P1, P2 and P3 are stored back to back in the expanded key
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    // t = shake256(digest || salt), the salt being the last salt_bytes of sig
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    // y is all zero and is passed as both the addend and the output, so the
+    // result is the reduced value itself
+    compute_rhs(p, SPS, y, y);
+    
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // taken from its own parameter, independent of the salt length
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) ((p)->name) // dynamic accessors: p is parenthesized so any pointer expression (e.g. &params) is accepted
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o) // v = n - o, derived from the stored fields
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1) // k*o + 1, derived from the stored fields
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail;
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes;
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name;
+} mayo_params_t;
+
+typedef struct sk_t {
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+    uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible to allocate sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ * 
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// branch-free a > b: returns 0xFFFFFFFF if true, 0x00000000 if false
+// b - a is formed in 64 bits so it cannot overflow; its sign bit is read with an unsigned shift
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) 0 - (uint32_t) (((uint64_t) diff) >> 63);
+}
+
+// branch-free a > b with a 64-bit mask: b - a is negative exactly when a > b
+// returns 0xFFFFFFFFFFFFFFFF if true, 0x0000000000000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint64_t) 0 - (((uint64_t) diff) >> 63);
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF (branch-free; also correct when a ^ b has its sign bit set)
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t) 0 - (uint32_t) ((((uint64_t) 0) - (uint64_t) (uint32_t) (a ^ b)) >> 63);
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (branch-free; also correct when a ^ b has its sign bit set)
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t) 0 - ((((uint64_t) 0) - (uint64_t) (uint32_t) (a ^ b)) >> 63);
+}
+
+// if a == b -> 0x00, else 0xFF (branch-free; top byte of the unsigned negation of a ^ b)
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) (((uint32_t) 0 - (uint32_t) (a ^ b)) >> 24);
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ * Forwards both arguments unchanged to OQS_MEM_secure_free().
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ * Forwards both arguments unchanged to OQS_MEM_cleanse().
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/params.c b/src/sig/mayo/pqmayo_mayo-3_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) /* defines the const mayo_params_t object nm from the nm##_<field> macros */ \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  } /* no ';' here: each MAYO_GEN_PARAMS(...) use supplies it; fields not listed (R_bytes) stay zero */
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v, O: v x o
+static 
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            cols_used ++;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); 
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k]     ^ _mm256_slli_epi16(t0,4);
+            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+static 
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]  ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(k*O_MAX) + 2*c]      ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c]     ^= temp[2*k+1] ^ t0;
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        cols_used += 1;
+        size_t pos = r;
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1);
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+            __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+            __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k),     acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+        }
+    }
+}
+
+
+static 
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k;
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]     ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+static 
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+static 
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k;
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator 
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+// Row r of both products is XOR-ed into acc[r][0..K_MAX-1]; every entry is two __m256i wide.
+static inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P1 = (const __m256i *) _P1; // inputs are only read: keep const through the casts
+    const __m256i *P2 = (const __m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t P1_cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        // P1 * S1
+        // (only the upper triangle of P1 is stored, so row r starts at column c = r)
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibbles
+            in_odd0 &= low_nibble_mask;                                          // low nibbles
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }
+        }
+
+        // P2 * S2
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of the first vector of each pair with the low nibbles of the second) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+// Row r of the product is XOR-ed into acc[r][0..K_MAX-1]; every entry is two __m256i wide.
+static inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P3 = (const __m256i *) _P3; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into the packed upper triangle of P3
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibbles
+            in_odd0 &= low_nibble_mask;                                          // low nibbles
+            __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of the first vector of each pair with the low nibbles of the second) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// Thin alias: forwards its three arguments unchanged to mayo_5_Vt_times_Pv_avx2.
+static inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc) {
+    mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+// S2^t * PS2 (two __m256i per entry): acc[k][c] ^= XOR over r < O_MAX of the S2_multabs(r, k) lookup on PS2[r][c].
+static inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (const __m256i *) _PS2; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of PS2 and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibbles
+            in_odd0 &= low_nibble_mask;                                          // low nibbles
+            __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of the first vector of each pair with the low nibbles of the second) and add to accumulator
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // last row k has no partner row k+1
+        acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v (upper triangular), O: v x o
+// Row r of the product is XOR-ed into acc[r][0..O_MAX-1]; one __m256i per entry. The k+=2 loops rely on O_MAX being even.
+static inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // _P1 is only read, so the const qualifier is kept through the cast
+    const __m256i *P1 = (const __m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into the packed upper triangle of P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+
+// O^t * P1O_P2 (one __m256i per entry): acc[k][c] ^= XOR over r < V_MAX of the O_multabs(r, k) lookup on P1O_P2[r][c].
+static inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    // _P1O_P2 is only read, so the const qualifier is kept through the cast
+    const __m256i *P1O_P2 = (const __m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column of P1O_P2 and accumulate results in temporary format
+        __m256i temp[O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c]     ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+// (P1 + P1^t) * O with P1 stored as a packed upper triangle: row r of the result is XOR-ed into acc[r][0..O_MAX-1].
+static inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // _P1 is only read, so the const qualifier is kept through the cast
+    const __m256i *P1 = (const __m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs); // lookup tables are built here from the raw O, not passed in
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[O_MAX] = {0};
+        cols_used += 1; // skip the diagonal entry (r, r): it would be XOR-ed with itself in P1 + P1^t
+        size_t pos = r; // index of entry (0, r) in the packed triangle
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + pos); // entry (c, r), i.e. the transposed part of row r
+            pos += (V_MAX -c - 1);                         // step down one row within column r
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }
+        }
+        // entries (r, c) to the right of the diagonal, read sequentially from the packed triangle
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }
+        }
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256(acc + (r*O_MAX + k    ), acc0 ^ temp[k  ] ^ _mm256_slli_epi16(t,4));
+            _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+        }
+    }
+}
+
+
+// V^t * L (one __m256i per entry): acc[k][c] ^= XOR over r < V_MAX of the V_multabs(r, k) lookup on L[r][c]; acc is K_MAX x O_MAX.
+static inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // _L is only read, so the const qualifier is kept through the cast
+    const __m256i *L = (const __m256i *) _L;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last row k has no partner row k+1
+        acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// V^t * Pv (one __m256i per entry): acc[k][c] ^= XOR over r < V_MAX of the V_multabs(r, k) lookup on Pv[r][c]; acc is K_MAX x K_MAX.
+static inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // _Pv is only read, so the const qualifier is kept through the cast
+    const __m256i *Pv = (const __m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last row k has no partner row k+1
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P1 * V^t with P1 stored as a packed upper triangle (one __m256i per entry): row r is XOR-ed into acc[r][0..K_MAX-1].
+static inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P1 = (const __m256i *) _P1; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into the packed upper triangle of P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P1*S1 -> P1: v x v (upper triangular), S1: v x k
+// Same computation as mayo_12_P1_times_Vt_avx2, with S1's tables passed in
+// place of V's, so this simply forwards its arguments unchanged.
+static inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc) {
+    mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+// Thin alias: forwards its three arguments unchanged to mayo_12_Vt_times_Pv_avx2.
+static inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc) {
+    mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+// S2^t * PS2 (one __m256i per entry): acc[k][c] ^= XOR over r < O_MAX of the S2_multabs(r, k) lookup on PS2[r][c].
+static inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (const __m256i *) _PS2; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // function scope on purpose: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column of PS2 and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last row k has no partner row k+1
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k
+// Row r of the product is XOR-ed into acc[r][0..K_MAX-1]; one __m256i per entry.
+static inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P2 = (const __m256i *) _P2; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // P2 is read sequentially, O_MAX entries per row
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular), S1: v x k, P2: v x o, S2: o x k
+// Row r of both products is XOR-ed into acc[r][0..K_MAX-1]; one __m256i per entry.
+static inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P1 = (const __m256i *) _P1; // inputs are only read: keep const through the casts
+    const __m256i *P2 = (const __m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t P1_cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+
+        // P1 * S1
+        // (only the upper triangle of P1 is stored, so row r starts at column c = r)
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+            }
+        }
+
+        // P2 * S2
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular
+// Row r of the product is XOR-ed into acc[r][0..K_MAX-1]; one __m256i per entry.
+static inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k has function scope: the odd-K_MAX tail reuses the loop's final value (K_MAX-1)
+    const __m256i *P3 = (const __m256i *) _P3; // input is only read: keep const through the cast
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): dereferenced directly below, so presumably 32-byte aligned - confirm at call sites
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into the packed upper triangle of P3
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P3 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibbles
+            in_odd &= low_nibble_mask;                                         // low nibbles
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and add to accumulator
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // last column k has no partner column k+1
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// Upper-triangular fold: out receives in[r][r], then in[r][c] ^ in[c][r] for every c > r, packed row by row.
+// NOTE(review): entries are m_legs*2 words apart but exactly one __m256i (4 words) is moved, so m_legs is presumably 2 - confirm.
+static inline void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+    int m_vecs_stored = 0; // number of entries written to out so far
+    // size is the row stride of in; the rows and columns visited are bounded by O_MAX
+    for (int r = 0; r < O_MAX; ++r) {
+        const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r));
+        __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+        _out[0] = _in[0];
+        m_vecs_stored++;
+        for (int c = r + 1; c < O_MAX; ++c) {
+            const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c));
+            const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r));
+            _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+            _out[0] = _in2[0] ^ _in3[0];
+            m_vecs_stored++;
+        }
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v (upper triangular, stored row by row), O: v x o (given through O_multabs); acc(r,k) ^= sum_{c>=r} P1(r,c) * O(c,k)
+static 
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+    // every matrix entry spans 6 uint64_t words (96 nibbles) and is processed as two overlapping 256-bit halves (words 0-3 and 2-5)
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // running index into P1: only the entries with column c >= row r are read, in storage order
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // one O_multabs vector serves the output pair (k, k+1); it is used as a byte lookup table on all four nibble vectors
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: entry k collects the low nibbles of each temp pair, entry k+1 the high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+    // O^t * P1O_P2: acc(k,c) ^= sum_r O(r,k) * P1O_P2(r,c); O is given through O_multabs, P1O_P2 and acc are row-major with O_MAX columns
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of P1O_P2 and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // one O_multabs vector serves the output pair (k, k+1); it is used as a byte lookup table on all four nibble vectors
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k collects the low nibbles of each temp pair, row k+1 the high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // (P1 + P1^t)*O: acc(r,k) ^= sum_{c<r} P1(c,r)*O(c,k) ^ sum_{c>r} P1(r,c)*O(c,k); P1 is upper triangular, stored row by row; its diagonal is skipped
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs); // fills the lookup tables from O (helper defined outside this file)
+
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        cols_used ++; // step over the diagonal entry (r,r)
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        size_t pos = r; // storage index of entry (0,r); walks down column r while c < r
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // distance from entry (c,r) to entry (c+1,r) in the packed upper triangle
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // entries right of the diagonal: (r,c) for c > r, read sequentially through cols_used
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: entry k collects the low nibbles of each temp pair, entry k+1 the high nibbles
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+    // V^t * L: acc(k,c) ^= sum_r V(r,k) * L(r,c); V is given through V_multabs, L and acc are row-major with O_MAX columns
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t k; // declared outside the loops: the odd-K_MAX tail below reuses its final value
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // V_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k collects the low nibbles of each temp pair, row k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6)); // tail: k == K_MAX-1 here, the last row has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+      const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // P1 * V^t: acc(r,k) ^= sum_{c>=r} P1(r,c) * V(c,k); P1 is upper triangular, stored row by row; V is given through V_multabs; acc is row-major with K_MAX columns
+    size_t k,c; // k is declared outside the loops: the odd-K_MAX tail below reuses its final value
+    size_t cols_used = 0; // running index into P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // V_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: entry k collects the low nibbles of each temp pair, entry k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // tail: k == K_MAX-1 here, the last column has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // V^t * Pv: acc(k,c) ^= sum_r V(r,k) * Pv(r,c); V is given through V_multabs, Pv and acc are row-major with K_MAX columns
+    size_t k; // declared outside the loops: the odd-K_MAX tail below reuses its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // V_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k collects the low nibbles of each temp pair, row k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); // tail: k == K_MAX-1 here, the last row has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, stored row by row), S1: v x k, P2: v x o, S2: o x k; acc(r,k) ^= sum_{c>=r} P1(r,c)*S1(c,k) ^ sum_c P2(r,c)*S2(c,k)
+static 
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // S1 and S2 are given through S1_multabs / S2_multabs; both partial products share one temp buffer per row
+    size_t k,c; // k is declared outside the loops: the odd-K_MAX tail below reuses its final value
+    size_t P1_cols_used = 0; // running index into P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        // P1 times S1
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used++;
+            // S1_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 times S2 (P2 is a full v x o matrix, row-major with O_MAX columns)
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: entry k collects the low nibbles of each temp pair, entry k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // tail: k == K_MAX-1 here, the last column has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o (upper triangular, stored row by row), S2: o x k (given through S2_multabs); acc(r,k) ^= sum_{c>=r} P3(r,c) * S2(c,k)
+static 
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // every matrix entry spans 6 uint64_t words (96 nibbles); acc is row-major with K_MAX columns
+    size_t k,c; // k is declared outside the loops: the odd-K_MAX tail below reuses its final value
+    size_t cols_used = 0; // running index into P3
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // S2_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: entry k collects the low nibbles of each temp pair, entry k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6)); // tail: k == K_MAX-1 here, the last column has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+    mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); // S1^t * PS1 is the same computation as V^t * Pv, so this only forwards its arguments
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // S2^t * PS2: acc(k,c) ^= sum_{r<O_MAX} S2(r,k) * PS2(r,c); S2 is given through S2_multabs, PS2 and acc are row-major with K_MAX columns
+    size_t k; // declared outside the loops: the odd-K_MAX tail below reuses its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of PS2 and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // words 0-3: field elements #0 to #63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // words 2-5: field elements #32 to #95
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // S2_multabs vector k serves the output pair (2k, 2k+1); it is used as a byte lookup table on all four nibble vectors
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator: row k collects the low nibbles of each temp pair, row k+1 the high nibbles
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6)); // tail: k == K_MAX-1 here, the last row has no partner
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1.
+// Only the low four bits of the reduced product are returned.
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // carry-less product: one shifted copy of b per set bit of a
+    unsigned char prod = (unsigned char) ((a & 1) * b);
+    prod ^= (unsigned char) ((a & 2) * b);
+    prod ^= (unsigned char) ((a & 4) * b);
+    prod ^= (unsigned char) ((a & 8) * b);
+
+    // fold the high nibble back down using x^4 = x + 1
+    const unsigned char hi = prod & 0xf0;
+    const unsigned char lo = (unsigned char) (prod ^ (hi >> 4) ^ (hi >> 3));
+    return lo & 0x0f;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // carry-less product of a with every byte of b (assumes each byte of b holds one element in its low nibble)
+    uint64_t prod = (a & 1) * b;
+    prod ^= (a & 2) * b;
+    prod ^= (a & 4) * b;
+    prod ^= (a & 8) * b;
+
+    // reduce every byte lane mod x^4 + x + 1
+    const uint64_t hi = prod & 0xf0f0f0f0f0f0f0f0;
+    const uint64_t lo_mask = 0x0f0f0f0f0f0f0f0f;
+    uint64_t out = prod ^ (hi >> 4) ^ (hi >> 3);
+    return out & lo_mask;
+}
+
+// GF(16) addition: implemented as a bitwise XOR of the two operands
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: the same XOR as add_f
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: returns a unchanged
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+    // Inversion by exponentiation: the result is a^14, built from repeated
+    // squarings with mul_f; an input of 0 yields 0.
+
+    const unsigned char pow2 = mul_f(a, a);
+    const unsigned char pow4 = mul_f(pow2, pow2);
+    const unsigned char pow8 = mul_f(pow4, pow4);
+    const unsigned char pow6 = mul_f(pow2, pow4);
+
+    // a^8 * a^6 = a^14
+    return mul_f(pow8, pow6);
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) {
+    unsigned char sum = 0;  // sum over i < n of a[i] * b[i * m]
+    for (int idx = 0; idx < n; idx++) {
+        sum ^= mul_f(a[idx], b[idx * m]);
+    }
+    return sum;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) { // c = a * b, all row-major
+    for (int r = 0; r < row_a; r++) {
+        for (int j = 0; j < col_b; j++) {
+            c[r * col_b + j] = lincomb(a + r * colrow_ab, b + j, colrow_ab, col_b);
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) { // c = a + b, all m x n row-major
+    for (int row = 0; row < m; row++) {
+        for (int col = 0; col < n; col++) {
+            c[row * n + col] = a[row * n + col] ^ b[row * n + col];
+        }
+    }
+}
+
+static inline void m_vec_copy(int m_legs, const uint64_t *in,
+                              uint64_t *out) {
+    const int n_words = m_legs * 2;  // two 64-bit words are copied per leg
+    for (int w = 0; w < n_words; w++) {
+        out[w] = in[w];
+    }
+}
+
+static inline void m_vec_add(int m_legs, const uint64_t *in,
+                             uint64_t *acc) {
+    const int n_words = m_legs * 2;  // two 64-bit words are accumulated per leg
+    for (int w = 0; w < n_words; w++) {
+        acc[w] = acc[w] ^ in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+// Multiplies each of the eight nibbles packed in a by the gf16 element held
+// in the low four bits of b; higher bits of b are never inspected.
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    const uint32_t top_bits = 0x88888888; // bit 3 of every nibble
+    const uint32_t scalar = b;
+
+    // bit 0 of b contributes a itself
+    uint32_t shifted = a;
+    uint32_t result = shifted * (scalar & 1);
+
+    // bits 1..3 of b contribute a*x, a*x^2 and a*x^3 in turn
+    for (int bit = 1; bit <= 3; bit++) {
+        // multiply every nibble by x: drop the top bit, shift left by one,
+        // and add x + 1 (the constant 3) wherever the top bit was set
+        const uint32_t carry = shifted & top_bits;
+        shifted ^= carry;
+        shifted = (shifted << 1) ^ ((carry >> 3) * 3);
+
+        result ^= shifted * ((scalar >> bit) & 1);
+    }
+
+    return result;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) {  // acc[w] ^= a * in[w], one 64-bit word at a time
+        acc[w] = acc[w] ^ gf16v_mul_u64(in[w], a);
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) {  // same as m_vec_mul_add with the scalar fixed to 0x2
+        acc[w] = acc[w] ^ gf16v_mul_u64(in[w], 0x2);
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/LICENSE b/src/sig/mayo/pqmayo_mayo-3_opt/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/NOTICE b/src/sig/mayo/pqmayo_mayo-3_opt/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-3_opt/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 };   // aes128ctr_prf is always given an all-zero 12-byte IV
+    (void) inputByteLen;           // unused here; kept so the signature matches AES_128_CTR_NI
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);
+    return (int) outputByteLen;    // reports the requested output length
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/api.c b/src/sig/mayo/pqmayo_mayo-3_opt/api.c
new file mode 100644
index 0000000000..5c42eabc48
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_3
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Forwards to mayo_keypair with the MAYO_PARAMS selection; returns its status unchanged.
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Forwards to mayo_sign with the MAYO_PARAMS selection; returns its status unchanged.
+int crypto_sign(unsigned char *sm, size_t *smlen,
+                const unsigned char *m, size_t mlen,
+                const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+// Forwards to mayo_sign_signature with the MAYO_PARAMS selection; returns its status unchanged.
+int crypto_sign_signature(unsigned char *sig,
+                          size_t *siglen, const unsigned char *m,
+                          size_t mlen, const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Forwards to mayo_open with the MAYO_PARAMS selection; returns its status unchanged.
+int crypto_sign_open(unsigned char *m, size_t *mlen,
+                     const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk) {
+    // a signature whose length is not exactly CRYPTO_BYTES yields -1 without calling mayo_verify
+    const int length_ok = (siglen == CRYPTO_BYTES);
+    return length_ok ? mayo_verify(MAYO_PARAMS, m, mlen, sig, pk) : -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/api.h b/src/sig/mayo/pqmayo_mayo-3_opt/api.h
new file mode 100644
index 0000000000..b08c24704e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 32
+#define CRYPTO_PUBLICKEYBYTES 2656
+#define CRYPTO_BYTES 577
+
+#define CRYPTO_ALGNAME "MAYO-3"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    // out(r,c) = in(r,c) + in(c,r) above the diagonal; diagonal entries are copied as-is
+    const int vec_words = m_legs * 2;
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++, out += vec_words) {
+            m_vec_copy(m_legs, in + vec_words * (r * size + c), out);
+            if (c > r) {
+                m_vec_add(m_legs, in + vec_words * (c * size + r), out);
+            }
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    // Portable path. P1 is walked one m-vector per entry (r, c) with c >= r,
+    // row by row. Every off-diagonal entry is used twice: scaled by O[c][k]
+    // it is added to acc[r][k], and scaled by O[r][k] it is added to acc[c][k].
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int n_oil = PARAM_o(p);
+    const int n_vinegar = PARAM_n(p) - PARAM_o(p);
+
+    int entry = 0; // running index of the current P1 entry
+    for (int r = 0; r < n_vinegar; r++) {
+        for (int c = r; c < n_vinegar; c++, entry++) {
+            if (c == r) {
+                continue; // diagonal entries are skipped (entry still advances)
+            }
+            for (int k = 0; k < n_oil; k++) {
+                // the stride per m-vector is 4, 6 or 8 words for fixed variants
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * entry, O[c * n_oil + k], acc + 4 * (r * n_oil + k));
+                vec_mul_add_64(P1 + 4 * entry, O[r * n_oil + k], acc + 4 * (c * n_oil + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(P1 + 6 * entry, O[c * n_oil + k], acc + 6 * (r * n_oil + k));
+                vec_mul_add_96(P1 + 6 * entry, O[r * n_oil + k], acc + 6 * (c * n_oil + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(P1 + 8 * entry, O[c * n_oil + k], acc + 8 * (r * n_oil + k));
+                vec_mul_add_128(P1 + 8 * entry, O[r * n_oil + k], acc + 8 * (c * n_oil + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[c * n_oil + k], acc + m_legs * 2 * (r * n_oil + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[r * n_oil + k], acc + m_legs * 2 * (c * n_oil + k));
+#endif
+
+            }
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    // Portable path (p is already marked as used at the top of the function).
+    // Three calls: V with L into M, P1 with V transposed into the zeroed
+    // scratch Pv, then V with Pv into Y. The helpers are named mul_add_*, so
+    // they presumably accumulate into their output - confirm at their definition.
+    const int m_legs = PARAM_m(p) / 32;
+    const int n_oil = PARAM_o(p);
+    const int n_vinegar = PARAM_n(p) - PARAM_o(p);
+    const int n_k = PARAM_k(p);
+
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+
+    mul_add_mat_x_m_mat(m_legs, V, L, M, n_k, n_vinegar, n_oil);
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, V, Pv, n_vinegar, n_vinegar, n_k, 1);
+
+    mul_add_mat_x_m_mat(m_legs, V, Pv, Y, n_k, n_vinegar, n_k);
+
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    // Portable path (p is already marked as used at the top of the function):
+    // first P1 with O into P1O_P2, then O transposed with P1O_P2 into P3.
+    const int m_legs = PARAM_m(p) / 32;
+    const int n_oil = PARAM_o(p);
+    const int n_minus_o = PARAM_n(p) - n_oil;
+    const int n_vinegar = PARAM_v(p);
+
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, n_minus_o, n_minus_o, n_oil, 1);
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, n_vinegar, n_oil, n_oil);
+
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m; // m is only read by the generic (non-AVX) path at the bottom
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+    // split each of the k rows of S (n entries) into its first v entries (S1) and its last o entries (S2)
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    // PS is used as two parts: the first starts at PS, the second at PS + V_MAX*K_MAX*M_MAX/16
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    // (done in two calls, one per part of PS)
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+#error "m_calculate_PS_SPS: AVX2 kernels exist only for M_MAX == 64, 96 or 128"
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        // only the first k*o entries of each row are used, so r[k*o] is never read
+        Ar[i] = lincomb(A + i * (k * o + 1), r, k * o, 1);
+    }
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o - 1);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+        // capped at k*o - 1: x has only k*o entries, and column k*o of A holds y rather than a coefficient
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1];
+            x[col] ^= u;
+            // rows are updated eight at a time; NOTE(review): this touches rows up to i+7, so m is presumably a multiple of 8 - confirm
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp);
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 128 packed GF(16) elements (8 uint64_t limbs) from `in`
+// to `out`.
+//
+// in:  source limbs, read at indices 0..7
+// out: destination limbs, written at indices 0..7 in ascending order
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    // Fixed trip count, so the loop is equivalent to eight plain assignments.
+    for (int limb = 0; limb < 8; ++limb) {
+        out[limb] = in[limb];
+    }
+}
+
+
+// Add (XOR) one vector of 128 packed GF(16) elements into an accumulator.
+// Both vectors are 8 uint64_t limbs long.
+//
+// in:  limbs to add, read at indices 0..7
+// acc: accumulator, updated in place at indices 0..7 in ascending order
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    // Fixed trip count, so the loop is equivalent to eight plain XOR updates.
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// acc ^= x * in on 8 limbs of packed GF(16) nibbles; x^4 is reduced to x + 1.
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        const uint64_t top = in[limb] & 0x8888888888888888ULL;  // x^3 coefficients
+        const uint64_t low = in[limb] & 0x7777777777777777ULL;  // x^2..x^0 coefficients
+        acc[limb] ^= (low << 1) ^ ((top >> 3) * 3);
+    }
+}
+
+// acc ^= x^-1 * in on 8 limbs of packed GF(16) nibbles; x^-1 equals x^3 + 1.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        const uint64_t unit = in[limb] & 0x1111111111111111ULL;  // constant coefficients
+        const uint64_t rest = in[limb] & 0xeeeeeeeeeeeeeeeeULL;  // x^3..x^1 coefficients
+        acc[limb] ^= (rest >> 1) ^ (unit * 9);
+    }
+}
+
+// acc ^= a * in over 8 limbs of packed GF(16) nibbles. `a` is expected to be a
+// field element below 16; its four per-bit multipliers come from mul_table(a).
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t packed = mul_table(a);
+    const uint32_t f0 = packed & 0xff, f1 = (packed >> 8) & 0xf;
+    const uint32_t f2 = (packed >> 16) & 0xf, f3 = (packed >> 24) & 0xf;
+    for (int limb = 0; limb < 8; ++limb) {
+        const uint64_t w = in[limb];   // bit j of each nibble selects multiplier fj
+        acc[limb] ^= (w & nibble_lsb) * f0 ^ ((w >> 1) & nibble_lsb) * f1
+                   ^ ((w >> 2) & nibble_lsb) * f2 ^ ((w >> 3) & nibble_lsb) * f3;
+    }
+}
+
+// Fold the 16 bins (8 limbs each) into bin 1 and copy that bin to `out`.
+// Two chains are interleaved in the same call order as the unrolled form:
+// 5->10->7->14->15->13->9->1 goes through m_vec_mul_add_x_inv_128 and
+// 11->12->6->3->8->4->2->1 goes through m_vec_mul_add_x_128. `bins` is modified.
+static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // Bin numbers visited by each chain; entry s feeds entry s + 1.
+    static const unsigned char inv_path[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char mul_path[8] = {11, 12, 6, 3, 8, 4, 2, 1};
+    enum { LIMBS = 8 };
+
+    for (int s = 0; s < 7; s++) {
+        const uint64_t *inv_src = bins + inv_path[s] * LIMBS;
+        const uint64_t *mul_src = bins + mul_path[s] * LIMBS;
+        m_vec_mul_add_x_inv_128(inv_src, bins + inv_path[s + 1] * LIMBS);
+        m_vec_mul_add_x_128(mul_src, bins + mul_path[s + 1] * LIMBS);
+    }
+
+    vec_copy_128(bins + 1 * LIMBS, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 64 packed GF(16) elements (4 uint64_t limbs) to `out`.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    // Ascending order, one limb at a time, as in the unrolled form.
+    for (int limb = 0; limb < 4; ++limb) {
+        out[limb] = in[limb];
+    }
+}
+
+// XOR one vector of 64 packed GF(16) elements (4 uint64_t limbs) into `acc`.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    // Ascending order, one limb at a time, as in the unrolled form.
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] ^= in[limb];
+    }
+}
+
+// Multiply each of the 16 GF(16) nibbles packed in `a` by the field element
+// `b` (only bits 0..3 of `b` are used); the field is Z_2[x]/(x^4 + x + 1).
+//
+// Shift-and-add: the running value is a, a*x, a*x^2, a*x^3 and each one is
+// XORed into the result when the matching bit of `b` is set. The bit test is
+// done by multiplying with 0 or 1, so there is no branch on `b`.
+//
+// Returns the 16 products, packed the same way as `a`.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t nibble_msb = 0x8888888888888888ULL;
+    const uint64_t scalar = b;
+    uint64_t power = a;                       // a * x^bit
+    uint64_t product = power * (scalar & 1);  // contribution of bit 0
+
+    for (int bit = 1; bit <= 3; ++bit) {
+        // power *= x: shift the low three coefficients up and replace each
+        // overflowing x^4 by x + 1 (binary 0011) in the same nibble.
+        const uint64_t carry = power & nibble_msb;
+        power = ((power ^ carry) << 1) ^ ((carry >> 3) * 3);
+        product ^= power * ((scalar >> bit) & 1);
+    }
+
+    return product;
+}
+
+// For b < 16: pack b, b*x, b*x^2, b*x^3 (GF(16), x^4 = x + 1) into the low
+// nibbles of bytes 0..3. The high nibbles of the result are not cleared.
+static inline uint32_t mul_table(uint8_t b){
+    const uint32_t spread = ((uint32_t) b) * 0x08040201;   // byte i holds b << i
+    const uint32_t overflow = spread & 0xf0f0f0f0;         // terms at x^4 and above
+
+    return (spread ^ (overflow >> 4) ^ (overflow >> 3));   // fold x^4 into x + 1
+}
+
+// acc ^= a * in over 4 limbs of packed GF(16) nibbles. `a` is expected to be a
+// field element below 16; its four per-bit multipliers come from mul_table(a).
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t packed = mul_table(a);
+    const uint32_t f0 = packed & 0xff, f1 = (packed >> 8) & 0xf;
+    const uint32_t f2 = (packed >> 16) & 0xf, f3 = (packed >> 24) & 0xf;
+    for (int limb = 0; limb < 4; ++limb) {
+        const uint64_t w = in[limb];   // bit j of each nibble selects multiplier fj
+        acc[limb] ^= (w & nibble_lsb) * f0 ^ ((w >> 1) & nibble_lsb) * f1
+                   ^ ((w >> 2) & nibble_lsb) * f2 ^ ((w >> 3) & nibble_lsb) * f3;
+    }
+}
+
+// acc ^= a * in over the first `legs` limbs of packed GF(16) nibbles. `a` is
+// expected to be below 16; its per-bit multipliers come from mul_table(a).
+static inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t packed = mul_table(a);
+    const uint32_t f0 = packed & 0xff, f1 = (packed >> 8) & 0xf;
+    const uint32_t f2 = (packed >> 16) & 0xf, f3 = (packed >> 24) & 0xf;
+    for (int limb = 0; limb < legs; ++limb) {
+        const uint64_t w = in[limb];   // bit j of each nibble selects multiplier fj
+        acc[limb] ^= (w & nibble_lsb) * f0 ^ ((w >> 1) & nibble_lsb) * f1
+                   ^ ((w >> 2) & nibble_lsb) * f2 ^ ((w >> 3) & nibble_lsb) * f3;
+    }
+}
+
+// acc ^= x * in on 4 limbs of packed GF(16) nibbles; x^4 is reduced to x + 1.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        const uint64_t top = in[limb] & 0x8888888888888888ULL;  // x^3 coefficients
+        const uint64_t low = in[limb] & 0x7777777777777777ULL;  // x^2..x^0 coefficients
+        acc[limb] ^= (low << 1) ^ ((top >> 3) * 3);
+    }
+}
+
+// acc ^= x^-1 * in on 4 limbs of packed GF(16) nibbles; x^-1 equals x^3 + 1.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        const uint64_t unit = in[limb] & 0x1111111111111111ULL;  // constant coefficients
+        const uint64_t rest = in[limb] & 0xeeeeeeeeeeeeeeeeULL;  // x^3..x^1 coefficients
+        acc[limb] ^= (rest >> 1) ^ (unit * 9);
+    }
+}
+
+// Fold the 16 bins (4 limbs each) into bin 1 and copy that bin to `out`.
+// Two chains are interleaved in the same call order as the unrolled form:
+// 5->10->7->14->15->13->9->1 goes through m_vec_mul_add_x_inv_64 and
+// 11->12->6->3->8->4->2->1 goes through m_vec_mul_add_x_64. `bins` is modified.
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // Bin numbers visited by each chain; entry s feeds entry s + 1.
+    static const unsigned char inv_path[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char mul_path[8] = {11, 12, 6, 3, 8, 4, 2, 1};
+    enum { LIMBS = 4 };
+
+    for (int s = 0; s < 7; s++) {
+        const uint64_t *inv_src = bins + inv_path[s] * LIMBS;
+        const uint64_t *mul_src = bins + mul_path[s] * LIMBS;
+        m_vec_mul_add_x_inv_64(inv_src, bins + inv_path[s + 1] * LIMBS);
+        m_vec_mul_add_x_64(mul_src, bins + mul_path[s + 1] * LIMBS);
+    }
+
+    vec_copy_64(bins + 1 * LIMBS, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one vector of 96 packed GF(16) elements (6 uint64_t limbs) from `in`
+// to `out`.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    // Ascending order, one limb at a time, as in the unrolled form.
+    for (int limb = 0; limb < 6; ++limb) {
+        out[limb] = in[limb];
+    }
+
+}
+
+// XOR one vector of 96 packed GF(16) elements (6 uint64_t limbs) into the
+// accumulator `acc`, which is updated in place.
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    // Ascending order, one limb at a time, as in the unrolled form.
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] ^= in[limb];
+    }
+
+}
+
+// acc ^= x * in on 6 limbs of packed GF(16) nibbles; x^4 is reduced to x + 1.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        const uint64_t top = in[limb] & 0x8888888888888888ULL;  // x^3 coefficients
+        const uint64_t low = in[limb] & 0x7777777777777777ULL;  // x^2..x^0 coefficients
+        acc[limb] ^= (low << 1) ^ ((top >> 3) * 3);
+    }
+}
+
+// acc ^= x^-1 * in on 6 limbs of packed GF(16) nibbles; x^-1 equals x^3 + 1.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        const uint64_t unit = in[limb] & 0x1111111111111111ULL;  // constant coefficients
+        const uint64_t rest = in[limb] & 0xeeeeeeeeeeeeeeeeULL;  // x^3..x^1 coefficients
+        acc[limb] ^= (rest >> 1) ^ (unit * 9);
+    }
+}
+
+// acc ^= a * in over 6 limbs of packed GF(16) nibbles. `a` is expected to be a
+// field element below 16; its four per-bit multipliers come from mul_table(a).
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t nibble_lsb = 0x1111111111111111ULL;
+    const uint32_t packed = mul_table(a);
+    const uint32_t f0 = packed & 0xff, f1 = (packed >> 8) & 0xf;
+    const uint32_t f2 = (packed >> 16) & 0xf, f3 = (packed >> 24) & 0xf;
+    for (int limb = 0; limb < 6; ++limb) {
+        const uint64_t w = in[limb];   // bit j of each nibble selects multiplier fj
+        acc[limb] ^= (w & nibble_lsb) * f0 ^ ((w >> 1) & nibble_lsb) * f1
+                   ^ ((w >> 2) & nibble_lsb) * f2 ^ ((w >> 3) & nibble_lsb) * f3;
+    }
+}
+
+// Fold the 16 bins (6 limbs each) into bin 1 and copy that bin to `out`.
+// Two chains are interleaved in the same call order as the unrolled form:
+// 5->10->7->14->15->13->9->1 goes through m_vec_mul_add_x_inv_96 and
+// 11->12->6->3->8->4->2->1 goes through m_vec_mul_add_x_96. `bins` is modified.
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // Bin numbers visited by each chain; entry s feeds entry s + 1.
+    static const unsigned char inv_path[8] = {5, 10, 7, 14, 15, 13, 9, 1};
+    static const unsigned char mul_path[8] = {11, 12, 6, 3, 8, 4, 2, 1};
+    enum { LIMBS = 6 };
+
+    for (int s = 0; s < 7; s++) {
+        const uint64_t *inv_src = bins + inv_path[s] * LIMBS;
+        const uint64_t *mul_src = bins + mul_path[s] * LIMBS;
+        m_vec_mul_add_x_inv_96(inv_src, bins + inv_path[s + 1] * LIMBS);
+        m_vec_mul_add_x_96(mul_src, bins + mul_path[s + 1] * LIMBS);
+    }
+
+    vec_copy_96(bins + 1 * LIMBS, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdalign.h>
+
+#ifndef MAYO_VARIANT
+// Collapse the 16 accumulator bins into one vector and store it in `out`.
+//
+// Bin i starts at bins + i * m_legs * 2. Judging by how the callers in this
+// header fill the bins, bin i collects the vectors that still have to be
+// multiplied by the GF(16) element encoded as i. Multiplication by a constant
+// is linear, so a bin whose index is the XOR of two other indices can be
+// added to those two bins instead:
+//
+//   1. every bin whose index is not a power of two is added to two other
+//      bins, in the order 15, 14, 13, 12, 11, 10, 9, 7, 6, 5, 3;
+//   2. bins 8, 4 and 2 are passed through m_vec_mul_add_x into bins 4, 2
+//      and 1 respectively;
+//   3. bin 1 is copied to `out`.
+//
+// `bins` is modified in place; bin 0 is never touched.
+//
+// NOTE(review): m_vec_add, m_vec_mul_add_x and m_vec_copy are defined outside
+// this header; presumably they take (legs, in, acc/out) like the fixed-size
+// vec_add_* / m_vec_mul_add_x_* / vec_copy_* helpers -- confirm.
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+    // {source bin, first target, second target}; source == first ^ second.
+    static const unsigned char fold[11][3] = {
+        {15, 12, 3}, {14, 8, 6}, {13, 10, 7}, {12, 8, 4},
+        {11,  9, 2}, {10, 8, 2}, { 9,  8, 1},
+        { 7,  4, 3}, { 6, 4, 2}, { 5,  4, 1}, { 3, 2, 1},
+    };
+    const int stride = m_legs * 2;   // uint64_t words per bin
+
+    for (int f = 0; f < 11; f++) {
+        uint64_t *src = bins + fold[f][0] * stride;
+
+        m_vec_add(m_legs, src, bins + fold[f][1] * stride);
+        m_vec_add(m_legs, src, bins + fold[f][2] * stride);
+    }
+
+    for (int b = 8; b > 1; b >>= 1) {
+        m_vec_mul_add_x(m_legs, bins + b * stride, bins + (b >> 1) * stride);
+    }
+
+    m_vec_copy(m_legs, bins + stride, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *PS) {
+
+    const int n = o + v;
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+    (void)m;
+#else
+    const int m_legs = m / 32;
+#endif
+
+    /*
+     * Layout, as indexed below (every entry is one vector of 4, 6 or 8
+     * uint64_t for M_MAX 64, 96 or 128, or m_legs * 2 in the generic build):
+     *   P1 - upper triangular v x v, rows stored back to back (j >= row only)
+     *   P2 - dense v x o, entry (row, j) at index row * o + j
+     *   P3 - upper triangular o x o, stored like P1
+     *   S  - k rows of n = v + o bytes; S[col * n + j] is used as a bin number,
+     *        so every byte is assumed to be below 16 -- TODO confirm at callers
+     *   PS - n x k output, entry (row, col) at index row * k + col
+     *
+     * Method: each output entry owns 16 bins. Every P vector is added into the
+     * bin selected by the matching byte of S, then the 16 bins are folded into
+     * the output vector by multiply_bins_* / m_multiply_bins.
+     *
+     * The bin number comes from S, so memory accesses depend on the data in S.
+     * An earlier variant, described as constant time but not required to be,
+     * split S into S1 (columns < v) and S2 (columns >= v) and used
+     * mul_add_m_upper_triangular_mat_x_mat_trans for P1*S1, P2*S2 and P3*S2.
+     *
+     * NOTE(review): the generic stack-efficient lines call bitsliced_* helpers
+     * while the default path calls m_vec_add / m_multiply_bins -- confirm. */
+
+    // use the more stack efficient version (one value of col per pass) for MAYO_3 and MAYO_5
+    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+    uint64_t accumulator[M_MAX * N_MAX] = {0};
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) {
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+            accumulator[i] = 0;
+        }
+        P1_used = 0;
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P1_used ++;
+            }
+
+            for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+            }
+        }
+
+        P3_used = 0;
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P3_used ++;
+            }
+        }
+
+        for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+           multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+           multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+           multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+           bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+        }
+    }
+
+    #else  // default path: bins for all k values of col are kept at once
+
+    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+    int P1_used = 0;
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P1_used ++;
+        }
+
+
+        for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+            }
+#endif
+        }
+    }
+
+    int P3_used = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P3_used ++;
+        }
+    }
+
+    // Fold the 16 bins of each of the n * k output entries into one vector of PS.
+    int i = 0;
+    while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+        i++;
+#endif
+    }
+
+    #endif  // stack-efficient vs. default path
+}
+
+
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){ // folds PS (indexed as n x k vectors) and S (indexed as k rows of n values) into the k*k vectors of SPS
+    alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0}; // 16 bins for each of the k*k outputs, all starting at zero
+    #if !defined(MAYO_VARIANT)
+    const int m_legs = m/32;
+    #else
+    (void) m; // the fixed-size helpers used below do not take m
+    #endif
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            for (int col = 0; col < k; col += 1) { // add PS vector (j, col) into the bin of output (row, col) selected by S[row * n + j]
+                #if defined(MAYO_VARIANT) && (M_MAX == 64)
+                    vec_add_64(PS + (j * k + col) * 4, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 4 );
+                #elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                    vec_add_96(PS + (j * k + col) * 6, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 6 );
+                #elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                    vec_add_128(PS + (j * k + col) * 8, accumulator + ( (row * k + col) * 16 + S[row * n + j] ) * 8 );
+                #else
+                    m_vec_add(m_legs, PS + (j * k + col) * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[row * n + j] )*m_legs * 2 );
+                #endif
+            }
+        }
+    }
+
+    // combine the 16 bins of each accumulator entry and write the result to SPS (presumably each bin is scaled by its index - confirm against multiply_bins_*).
+    int i = 0;
+    while (i < k*k) { // one iteration per output vector
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, SPS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, SPS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, SPS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m_legs, accumulator + i * 16 * m_legs * 2, SPS + i * m_legs * 2);
+        i++;
+#endif
+    }
+}
+
+
+// multiplies m (possibly upper triangular) matrices with a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) { // acc(r, k) += bs_mat(r, c) * mat(c, k); acc is only ever added to, never reset
+
+    int bs_mat_entries_used = 0; // running index into bs_mat, which is read densely in visiting order
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++) { // triangular == 1 starts each row at the diagonal; triangular == 0 visits the full row
+            for (int k = 0; k < mat_cols; k += 1) { // mat is row-major with mat_cols entries per row
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 4 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 6 * (r * mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[c * mat_cols + k], acc + 8 * (r * mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[c * mat_cols + k], acc + m_legs * 2 * (r * mat_cols + k));
+#endif
+            }
+            bs_mat_entries_used += 1; // advance to the next stored bs_mat entry
+        }
+    }
+}
+
+// multiplies m (possibly upper triangular) matrices with the transpose of a single matrix and adds result to acc
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) { // acc(r, k) += bs_mat(r, c) * mat(k, c); acc is only ever added to, never reset
+
+    int bs_mat_entries_used = 0; // running index into bs_mat, which is read densely in visiting order
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++) { // triangular == 1 starts each row at the diagonal; triangular == 0 visits the full row
+            for (int k = 0; k < mat_rows; k += 1) { // mat is row-major with bs_mat_cols entries per row and is read transposed
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 4 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 6 * (r * mat_rows + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + 8 * (r * mat_rows + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * bs_mat_entries_used, mat[k * bs_mat_cols + c], acc + m_legs * 2 * (r * mat_rows + k));
+#endif
+            }
+            bs_mat_entries_used += 1; // advance to the next stored bs_mat entry
+        }
+    }
+}
+
+// multiplies the transpose of a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { // acc(r, k) += mat(c, r) * bs_mat(c, k); acc is only ever added to, never reset
+
+    for (int r = 0; r < mat_cols; r++) { // r walks the columns of mat, i.e. the rows of its transpose
+        for (int c = 0; c < mat_rows; c++) {
+            for (int k = 0; k < bs_mat_cols; k += 1) { // bs_mat and acc are both indexed with bs_mat_cols vectors per row
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + 8 * (r * bs_mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[c * mat_cols + r], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+            }
+        }
+    }
+}
+
+// multiplies a single matrix with m matrices and adds result to acc
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) { // acc(r, k) += mat(r, c) * bs_mat(c, k); acc is only ever added to, never reset
+
+    for (int r = 0; r < mat_rows; r++) { // mat is row-major with mat_cols entries per row
+        for (int c = 0; c < mat_cols; c++) {
+            for (int k = 0; k < bs_mat_cols; k += 1) { // bs_mat and acc are both indexed with bs_mat_cols vectors per row
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                (void) m_legs;
+                vec_mul_add_64(bs_mat + 4 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 4 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                (void) m_legs;
+                vec_mul_add_96(bs_mat + 6 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 6 * (r * bs_mat_cols + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                (void) m_legs;
+                vec_mul_add_128(bs_mat + 8 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + 8 * (r * bs_mat_cols + k));
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * (c * bs_mat_cols + k), mat[r * mat_cols + c], acc + m_legs * 2 * (r * bs_mat_cols + k));
+#endif
+            }
+        }
+    }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    // Sixteen 4-bit elements per 64-bit word: select the word, then shift the wanted nibble down.
+    const uint64_t word = in[index / 16];
+    const int shift = (index % 16) * 4;
+    return (word >> shift) & 0xF;
+}
+
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) { // packs ncols values from in, two per byte, into the bytes of out
+    int i;
+    unsigned char *out8 = (unsigned char *)out; // out is written byte by byte
+    for(i = 0; i+1 < ncols; i += 2){ // full pairs: in[i] goes to the low nibble, in[i+1] to the high nibble
+#ifdef TARGET_BIG_ENDIAN
+        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]  = (in[i+0] << 0) | (in[i+1] << 4);
+#else
+        out8[i/2]  = (in[i+0] << 0) | (in[i+1] << 4);
+#endif
+    }
+    if (ncols % 2 == 1){ // odd ncols: the last value is stored alone in the low nibble
+#ifdef TARGET_BIG_ENDIAN
+        out8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]  = (in[i+0] << 0);
+#else
+        out8[i/2]  = (in[i+0] << 0);
+#endif
+    }
+}
+
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) { // expands legs 64-bit words into legs * 16 values, one per output byte
+    const unsigned char *in8 = (const unsigned char *)in; // in is read byte by byte
+    for(int i = 0; i < legs * 16; i += 2){ // each input byte yields two outputs: low nibble first, then high nibble
+#ifdef TARGET_BIG_ENDIAN
+        out[i]   = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8]) & 0xF;
+        out[i+1] = (in8[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        out[i]   = (in8[i/2]) & 0xF;
+        out[i+1] = (in8[i/2] >> 4);
+#endif
+    }
+}
+
+
+// put matrix in row echelon form with ones on first nonzero entries *in
+// constant time*
+static inline void EF(unsigned char *A, int nrows, int ncols) { // A holds nrows x ncols values, one per byte, and is rewritten in place
+
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 }; // packed working copy of A
+
+    int row_len = (ncols + 15) / 16; // 64-bit words per packed row (16 nibbles per word)
+
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+
+    // pivot row is secret, pivot col is not
+
+    unsigned char inverse;
+    int pivot_row = 0;
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+
+        // try to get a pivot row in constant time
+        unsigned char pivot = 0;
+        uint64_t pivot_is_zero = -1; // all-ones until a nonzero pivot has been extracted
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row); // used as a mask; presumably all-ones exactly when row == pivot_row - confirm against ct_compare_64
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row); // used as a mask; presumably all-ones when row > pivot_row - confirm
+
+            for (int j = 0; j < row_len; j++) { // masked XOR: the row is folded in only if it is the pivot row, or lies below it while the pivot is still zero
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2); // _pivot_row2 was zeroed above, so it receives just the scaled row
+
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            for (int col = 0; col < row_len; col++) { // the two masks are complementary, so the sum selects exactly one operand
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+
+        // eliminate entries below pivot
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            unsigned char below_pivot = (row > pivot_row); // 0 or 1; zeroes the multiplier for rows at or above the pivot
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+
+            vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+                                    packed_A + row * row_len);                            
+        }
+
+        pivot_row += (-(int64_t)(~pivot_is_zero)); // advances by one when the mask shows a nonzero pivot, otherwise by zero
+    }
+
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)]; // holds one unpacked row (row_len * 16 values)
+
+    // unbitslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) { // copy back only the ncols real entries, dropping the padding
+            A[i * ncols + j] = temp[j];
+        }
+    }
+
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2; // every input byte carries two 4-bit values, low nibble first
+    for (int b = 0; b < full_bytes; ++b) {
+        mdec[2 * b]     = m[b] & 0xf;
+        mdec[2 * b + 1] = m[b] >> 4;
+    }
+    // An odd length takes only the low nibble of one further input byte.
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int pairs = mlen / 2; // each output byte packs two input values, the first in the low nibble
+    for (int b = 0; b < pairs; ++b) {
+        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
+    }
+    // An odd length stores the final input value alone, copied unmasked into the last byte.
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[2 * pairs];
+    }
+}
+
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){ // y[i] = t[i] ^ (i-th nibble of the vPv entries combined modulo f(X))
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0}; // running value, 16 nibbles per word
+    unsigned char *temp_bytes = (unsigned char *) temp; // byte view of temp for the nibble-wise updates below
+    int k = 0; // word index; after the first pass it enters each pass pointing at the top word of temp
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60; // nibble shifted out of the top word (temp is all zero on the first pass)
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){ // carry each lower word's top nibble into the word above it
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){ // even jj lands in the low nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else { // odd jj lands in the high nibble of byte jj/2
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            for(k=0; k < PARAM_m(p)/16; k ++){ // adds entry (i, j) and, off the diagonal, also entry (j, i)
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            k--; // leave k on the top word for the next pass
+        }
+    }
+
+    // add to y
+    for (int i = 0; i < PARAM_m(p); i+=2) // two output elements per byte of temp
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+static void transpose_16x16_nibbles(uint64_t *M){ // in-place rewrite of 16 words M[0..15]; four swap stages at 4-, 8-, 16- and 32-bit granularity
+    static const uint64_t even_nibbles = 0x0f0f0f0f0f0f0f0f;
+    static const uint64_t even_bytes   = 0x00ff00ff00ff00ff;
+    static const uint64_t even_2bytes  = 0x0000ffff0000ffff;
+    static const uint64_t even_half    = 0x00000000ffffffff;
+
+    for (size_t i = 0; i < 16; i+=2) // stage 1: exchange nibbles between words i and i+1
+    {
+        uint64_t t = ((M[i] >> 4 ) ^ M[i+1]) & even_nibbles; // XOR-swap delta for the masked positions
+        M[i  ] ^= t << 4;
+        M[i+1] ^= t;
+    }
+
+    for (size_t i = 0; i < 16; i+=4) // stage 2: exchange bytes between words two apart
+    {
+        uint64_t t0 = ((M[i  ] >> 8) ^ M[i+2]) & even_bytes; 
+        uint64_t t1 = ((M[i+1] >> 8) ^ M[i+3]) & even_bytes; 
+        M[i  ] ^= (t0 << 8);
+        M[i+1] ^= (t1 << 8);
+        M[i+2] ^= t0;
+        M[i+3] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 4; i++) // stage 3: exchange 16-bit groups between words four apart
+    {
+        uint64_t t0 = ((M[i  ] >> 16) ^ M[i+ 4]) & even_2bytes;
+        uint64_t t1 = ((M[i+8] >> 16) ^ M[i+12]) & even_2bytes;
+
+        M[i   ] ^= t0 << 16;
+        M[i+ 8] ^= t1 << 16;
+        M[i+ 4] ^= t0;
+        M[i+12] ^= t1;
+    }
+    
+    for (size_t i = 0; i < 8; i++) // stage 4: exchange 32-bit halves between words eight apart
+    {
+        uint64_t t = ((M[i]>>32) ^ M[i+8]) & even_half; 
+        M[i  ] ^= t << 32;
+        M[i+8] ^= t;
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){ // builds A_out (one value per byte, PARAM_A_cols(p) bytes per row) from VtL; column PARAM_A_cols(p)-1 is not written here
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    int bits_to_shift = 0; // nibble offset within a word, advanced by 4 per (i, j) pair
+    int words_to_shift = 0; // whole-word part of the same offset
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16; // o*k rounded up to a multiple of 16
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    if(bits_to_shift > 0){ // spill the shifted-out high bits into the next word; the guard also avoids a shift by 64
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) { // off-diagonal pairs also add M_i into the column block of j
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            bits_to_shift += 4;
+            if(bits_to_shift == 64){ // carry the offset into whole words
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16) // transpose every 16-word group that can hold data
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    unsigned char tab[F_TAIL_LEN*4] = {0}; // each f_tail coefficient multiplied by 1, 2, 4 and 8
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++) // rows at index m and above are folded back into rows r+t-m using the f_tail table
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble; // bit 0 of every nibble
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble; // bit 1 of every nibble
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble; // bit 2 of every nibble
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble; // bit 3 of every nibble
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) // byte-swap so the byte-wise decode below sees the little-endian layout
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c)); // unpack up to 16 values of row r+i, stopping before the last column
+            }   
+        }
+    }
+}
+
+// Public API
+
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    // Thin public wrapper: key generation is done entirely by
+    // mayo_keypair_compact, and its status code is handed back to the caller
+    // unchanged, whether it is MAYO_OK or an error.
+    //
+    // p  - parameter set
+    // pk - forwarded as the compact public key output buffer
+    // sk - forwarded as the compact secret key output buffer
+    const int status = mayo_keypair_compact(p, pk, sk);
+    return status;
+}
+
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,         // writes a signature of m (mlen bytes) under the compact secret key csk into sig
+              size_t *siglen, const unsigned char *m,                       // *siglen is set only on success
+              size_t mlen, const unsigned char *csk) {                      // returns MAYO_OK, or an error code on failure
+    int ret = MAYO_OK;
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    int solved = 0;                         // becomes 1 once sample_solution reports success
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) { // one attempt per counter byte value
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+        decode(V + param_k * param_v_bytes, r, param_k * param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            solved = 1;
+            break;
+        }
+        memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t)); // no solution for this counter: reset Mtmp before retrying
+    }
+
+    if (!solved) { // every counter value failed: x was never accepted as a solution, so report an error rather than emit a bogus signature
+        ret = MAYO_ERR;
+        goto err;
+    }
+    // assemble s from k blocks of n values; the last o values of each block are copied from x, the first n - o are produced by mat_add
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes); // the salt occupies the last salt_bytes of the signature
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(y, M_MAX); // y is declared as secret data above, so wipe it with the other secrets
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,        // writes signature || message into sm
+              size_t *smlen, const unsigned char *m,            // *smlen is set to the combined length on success only
+              size_t mlen, const unsigned char *csk) {          // returns the status of mayo_sign_signature
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+    // Move the message into its final place first: m may overlap sm, and writing the signature into sm would otherwise clobber it.
+    memmove(sm + param_sig_bytes, m, mlen);
+    ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes)
+        goto err;
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const size_t sig_len = (size_t)PARAM_sig_bytes(p); // sm is laid out as signature || message
+    if (smlen < sig_len) {
+        return MAYO_ERR;
+    }
+    const unsigned char *msg = sm + sig_len;
+    const size_t msg_len = smlen - sig_len;
+    const int status = mayo_verify(p, msg, msg_len, sm, pk);
+    if (status != MAYO_OK) {
+        return status; // m and *mlen are left untouched when verification fails
+    }
+    *mlen = msg_len;
+    memmove(m, msg, msg_len);
+    return status;
+}
+
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,   // generates a key pair: csk receives the random seed, cpk receives seed_pk || Upper(P3)
+                         unsigned char *csk) {                          // returns MAYO_OK, or MAYO_ERR if randombytes fails
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk followed by the encoded secret O
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8); // the P2 part of P, handed to the helper below under this name
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    mayo_secure_clear(S, sizeof(S)); // S still holds the encoded secret O; mayo_expand_sk wipes its copy the same way
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P, sizeof(P)); // NOTE(review): presumably Ot_times_P1O_P2 leaves the O-dependent P1*O + P2 in this buffer - wiped defensively, confirm
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    // pk = PK_PRF output keyed by the seed at the start of cpk (P1 and P2 bytes), followed by the P3 bytes copied from cpk
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1_p2_len = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+    const int p3_len = PARAM_P3_bytes(p);
+    unsigned char *p3_out = pk + p1_p2_len;
+    PK_PRF(pk, p1_p2_len, cpk, seed_len);
+    memmove(p3_out, cpk + seed_len, p3_len);
+    return MAYO_OK;
+}
+
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk, // expands the compact secret key csk (a seed) into sk
+                   sk_t *sk) { // always returns MAYO_OK: ret is never changed below
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX]; // seed_pk followed by the encoded O
+    uint64_t *P = sk->p; // P1 and P2 are expanded directly into the output structure
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    unsigned char *seed_pk = S;
+
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { // swap into the word layout used by the arithmetic below
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    uint64_t *L = P2; // L aliases the P2 region, which P1P1t_times_O receives as its output argument
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes); // the encoded (still packed) O is what gets stored
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) { // swap back, undoing the conversion above
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+int mayo_verify(const mayo_params_t *p, const unsigned char *m, // checks sig over the mlen-byte message m against the compact public key cpk
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) { // returns MAYO_OK for a good signature, MAYO_ERR otherwise
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8]; // expanded public key: P1, then P2, then P3
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes, // the salt is the last salt_bytes of the signature
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    compute_rhs(p, SPS, y, y); // y is passed as both the t input and the output; it starts all zero, so the result is the reduced value alone
+    
+    if (memcmp(y, t, param_m) == 0) { // the signature is valid when the recomputed value equals the hashed target t
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // the variant's own sk_seed_bytes rather than an alias of the salt size
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters
+ */
+typedef struct { // read through the PARAM_*(p) macros; those only dereference p when ENABLE_PARAMS_DYNAMIC is defined
+    int m;
+    int n;
+    int o;
+    int k;
+    int q; // 16 in all four parameter sets defined in this header
+    const unsigned char *f_tail; // F_TAIL_LEN low-order coefficients of f(z), constant term first (see the F_TAIL_* macros)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // no PARAM_* accessor in this header reads this field
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name;
+} mayo_params_t;
+
+typedef struct sk_t {
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+    uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compact secret key.
+ * The caller is responsible to allocate sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible to allocate sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ * 
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m in bytes
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/mem.h b/src/sig/mayo/pqmayo_mayo-3_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__)
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative; the difference is formed in 64 bits so it cannot overflow for any pair of ints
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// a > b -> b - a is negative
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint64_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF; a ^ b is negated as an unsigned 64-bit value so a negative a ^ b is handled too
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t)(0 - ((0 - (uint64_t)(uint32_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF; a ^ b is negated as an unsigned value so a negative a ^ b is handled too
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t)(0 - ((0 - (uint64_t)(uint32_t)(a ^ b)) >> (8*sizeof(uint64_t)-1)));
+}
+
+// if a == b -> 0x00, else 0xFF
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (int8_t)((-(int32_t)(a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory.
+ * 
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory.
+ * 
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/params.c b/src/sig/mayo/pqmayo_mayo-3_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) /* defines the const parameter set nm from the nm##_* macros; the invocation supplies the ';' */ \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-3_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication mod x^4 + x + 1
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // carryless multiply
+    unsigned char p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1
+    unsigned char top_p = p & 0xf0;
+    unsigned char out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f;
+    return out;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) { // same steps as mul_f, applied to all eight bytes of b at once
+    // carryless multiply: XOR together b scaled by each of the four low bits of a
+    uint64_t p;
+    p  = (a & 1)*b;
+    p ^= (a & 2)*b;
+    p ^= (a & 4)*b;
+    p ^= (a & 8)*b;
+
+    // reduce mod x^4 + x + 1, per byte; assumes every byte of b held a value below 16 — TODO confirm with callers
+    uint64_t top_p = p & 0xf0f0f0f0f0f0f0f0;
+    uint64_t out = (p ^ (top_p >> 4) ^ (top_p >> 3)) & 0x0f0f0f0f0f0f0f0f;
+    return out;
+}
+
+// GF(16) addition
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) { // GF(16) inverse computed as a^14 with mul_f only; 0 maps to 0
+    // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+    // 10, 4, 3, 8}; return table[a & 15];
+
+    unsigned char a2 = mul_f(a, a);
+    unsigned char a4 = mul_f(a2, a2);
+    unsigned char a8 = mul_f(a4, a4);
+    unsigned char a6 = mul_f(a2, a4);
+    unsigned char a14 = mul_f(a8, a6); // a^15 = 1 for every nonzero a, so a^14 = a^-1
+
+    return a14;
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) { // GF(16) sum of a[i] * b[i*m] for i in [0, n)
+    unsigned char ret = 0;
+    for (int i = 0; i < n; ++i, b += m) { // b advances by the stride m on every step
+        ret = add_f(mul_f(a[i], *b), ret);
+    }
+    return ret;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) { // c = a * b: a is row_a x colrow_ab, b is colrow_ab x col_b, row-major
+    for (int i = 0; i < row_a; ++i, a += colrow_ab) { // a points at row i, which has colrow_ab entries
+        for (int j = 0; j < col_b; ++j, ++c) { // c is written sequentially: row_a rows of col_b entries
+            *c = lincomb(a, b + j, colrow_ab, col_b); // row i of a times column j of b (walked with stride col_b)
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) {
+    for (int i = 0; i < m; ++i) {
+        for (int j = 0; j < n; ++j) {
+            *(c + i * n + j) = add_f(*(a + i * n + j), *(b + i * n + j));
+        }
+    }
+}
+
+static
+inline void m_vec_copy(int m_legs, const uint64_t *in,
+                                 uint64_t *out) {
+    for (int i = 0; i < m_legs * 2; i++) {
+        out[i] = in[i];
+    }
+}
+
+static
+inline void m_vec_add(int m_legs, const uint64_t *in,
+                                uint64_t *acc) {
+    for (int i = 0; i < m_legs * 2; i++) {
+        acc[i] ^= in[i];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) { // multiplies each of the eight nibbles of a by b; only bits 0..3 of b are used
+    uint32_t a_msb;
+    uint32_t a32 = a;
+    uint32_t b32 = b;
+    uint32_t r32 = a32 * (b32 & 1); // bit 0 of b: take a itself
+
+    a_msb = a32 & 0x88888888; // top bit (bit 3) of every nibble
+    a32 ^= a_msb;   // clear it so the shift below cannot spill into the next nibble
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // times x: shift, then fold x^4 back in as x + 1 (0b0011)
+    r32 ^= (a32) * ((b32 >> 1) & 1); // bit 1 of b: add a*x
+
+    a_msb = a32 & 0x88888888; // top bit (bit 3) of every nibble
+    a32 ^= a_msb;   // clear it before shifting
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 is now a*x^2
+    r32 ^= (a32) * ((b32 >> 2) & 1); // bit 2 of b: add a*x^2
+
+    a_msb = a32 & 0x88888888; // top bit (bit 3) of every nibble
+    a32 ^= a_msb;   // clear it before shifting
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3); // a32 is now a*x^3
+    r32 ^= (a32) * ((b32 >> 3) & 1); // bit 3 of b: add a*x^3
+
+    return r32;
+
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { // acc ^= a * in, over m_legs*2 64-bit words
+    for(int i=0; i < m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a); // gf16v_mul_u64 is not defined in this header; it must already be visible wherever this header is included
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for(int i=0;i<m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2);
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/LICENSE b/src/sig/mayo/pqmayo_mayo-5_avx2/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/NOTICE b/src/sig/mayo/pqmayo_mayo-5_avx2/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-5_avx2/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+// Fallback used when ENABLE_AESNI is not defined: forwards output/outputByteLen/input to aes128ctr_prf() with an all-zero 12-byte iv.
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen, const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 };
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);
+    (void) inputByteLen; // kept for signature parity with AES_128_CTR_NI; not consulted here
+    return (int) outputByteLen; // reports the requested output length
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/api.c b/src/sig/mayo/pqmayo_mayo-5_avx2/api.c
new file mode 100644
index 0000000000..f2e861e9c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_5
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Key-generation entry point: forwards pk and sk to mayo_keypair() with MAYO_PARAMS and returns its result.
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Signing entry point: forwards all arguments to mayo_sign() with MAYO_PARAMS.
+// The return value of mayo_sign() is passed through unchanged.
+int crypto_sign(unsigned char *sm, size_t *smlen, const unsigned char *m,
+                size_t mlen, const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+// Detached-signature entry point: forwards all arguments to mayo_sign_signature()
+// with MAYO_PARAMS and returns its result unchanged.
+int crypto_sign_signature(unsigned char *sig, size_t *siglen, const unsigned char *m,
+                          size_t mlen, const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Signed-message opening entry point: forwards all arguments to mayo_open()
+// with MAYO_PARAMS and returns its result unchanged.
+int crypto_sign_open(unsigned char *m, size_t *mlen, const unsigned char *sm,
+                     size_t smlen, const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+// Verification entry point. A signature whose length is not exactly CRYPTO_BYTES is rejected
+// with -1 without calling mayo_verify(); otherwise the result of mayo_verify() is returned.
+int crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                       const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    const int length_ok = (siglen == CRYPTO_BYTES);
+    return length_ok ? mayo_verify(MAYO_PARAMS, m, mlen, sig, pk) : -1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/api.h b/src/sig/mayo/pqmayo_mayo-5_avx2/api.h
new file mode 100644
index 0000000000..404d185c08
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 40
+#define CRYPTO_PUBLICKEYBYTES 5008
+#define CRYPTO_BYTES 838
+
+#define CRYPTO_ALGNAME "MAYO-5"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) { // packs a size x size matrix of m-vectors into upper-triangular form: out(r,c) = in(r,c) + in(c,r) for r < c, in(r,r) on the diagonal
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    const int stride = m_legs * 2; // uint64_t limbs per m-vector
+    uint64_t *dst = out;           // next packed output slot
+    for (int row = 0; row < size; row++) {
+        for (int col = row; col < size; col++, dst += stride) {
+            m_vec_copy(m_legs, in + stride * (row * size + col), dst);
+            if (row != col) {
+                m_vec_add(m_legs, in + stride * (col * size + row), dst);
+            }
+        }
+    }
+#endif
+}
+
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){ // (P1 + P1^T) * O; the generic path XOR-accumulates into acc
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+    // P1 is walked as a packed upper triangle (row by row, diagonal included); bs_mat_entries_used
+    // indexes the current entry. Diagonal entries cancel in P1 + P1^T (characteristic 2), so they are skipped.
+    int bs_mat_entries_used = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++) {
+            if(c==r) {
+                bs_mat_entries_used += 1;
+                continue;
+            }
+            for (int k = 0; k < param_o; k += 1) {
+                // entry (r,c) scales O[c][k] into acc(r,k) and O[r][k] into acc(c,k)
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * bs_mat_entries_used, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64( P1 + 4 * bs_mat_entries_used, O[r * param_o + k],  acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[c * param_o + k],  acc + 6 * (r * param_o + k));
+                vec_mul_add_96( P1 + 6 * bs_mat_entries_used, O[r * param_o + k],  acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[c * param_o + k],  acc + 8 * (r * param_o + k));
+                vec_mul_add_128( P1 + 8 * bs_mat_entries_used, O[r * param_o + k],  acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * bs_mat_entries_used, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+
+            }
+            bs_mat_entries_used += 1;
+        }
+    }
+#endif
+}
+
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) { // M from V and L, Y from V, P1 and V^T (used in Sign, per the prototype comment)
+    (void) p; // unused in the AVX2 branches below
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX]; // multiplication tables built from V by mayo_V_multabs_avx2()
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0}; // scratch: output of the P1_times_Vt step
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    // Generic path: three mul_add_* calls; presumably each accumulates into its output (M, Pv, Y) -- TODO confirm callers zero M and Y.
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mul_add_mat_x_m_mat(param_m / 32, V, L, M,
+        param_k, param_n - param_o, param_o);
+    // Pv from the upper-triangular P1 and V (transposed variant of the helper), then Y from V and Pv.
+    mul_add_m_upper_triangular_mat_x_mat_trans(param_m / 32, P1, V, Pv, param_n - param_o, param_n - param_o, param_k, 1);
+
+    mul_add_mat_x_m_mat(param_m / 32, V, Pv,
+        Y, param_k, param_n - param_o,
+        param_k);
+#endif
+}
+
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) { // P3 = O^T * (P1*O + P2) in KeyGen, per the prototype comment; P1O_P2 is updated in place first
+    (void) p; // unused in the AVX2 branches below
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX]; // multiplication tables built from O by mayo_O_multabs_avx2()
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int m_legs = PARAM_m(p) / 32; // same value as param_m/32 used in the first call below
+    mul_add_m_upper_triangular_mat_x_mat(param_m/32, P1, O, P1O_P2, param_n - param_o, param_n - param_o, param_o, 1); // presumably P1O_P2 += P1*O (P2 assumed preloaded) -- TODO confirm
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o); // presumably P3 += O^T * P1O_P2 -- TODO confirm
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;
+#if MAYO_AVX
+    const int n = o + v;
+
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];    
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    //m_calculate_SPS(PS, S, M_MAX, K_MAX, N_MAX, SPS);
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else 
+    NOT IMPLEMENTED
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long
+// return: 1 on success, 0 on failure
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+    // NOTE(review): the code below mixes k*o+1 and A_cols as the row stride of A -- presumably A_cols == k*o+1; confirm at the call site.
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1);
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+    // EF() rewrites A in place -- presumably into row echelon form (see echelon_form.h); TODO confirm.
+    EF(A, m, k * o + 1);
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) {
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+    // Rows are processed bottom-up; `finished` latches to 0xff once the row's first nonzero entry has been handled.
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0;
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+        // NOTE(review): col can reach k*o, so x[k*o] is read and written below although x is documented as k*o bytes -- confirm the caller's buffer size.
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; // row's last-column entry, or 0 when col is not the selected column
+            x[col] ^= u;
+            // Update the last column of rows i..i+7, eight rows per iteration: pack column `col` of those rows into one 64-bit word (one byte per row).
+            for (int i = 0; i < row; i += 8) {
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+                // NOTE(review): rows i..i+7 are accessed even when i+7 >= row -- assumes A holds a multiple of 8 rows; confirm.
+                tmp = mul_fx8(u, tmp);
+                // XOR the low nibble of each byte of tmp into the last column of the matching row.
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// Calculate P3 = O^T * (P1*O + P2) in KeyGen
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// Calculate Upper in KeyGen
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// Calculate acc = (P1+P1^T)*O in expand_sk
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Calculate M=V*L and Y=V*P1*V^T in Sign
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sample solution in Sign
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Calculate SPS = S*P*S^T in Verify
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h
new file mode 100644
index 0000000000..27b367e940
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_128.h
@@ -0,0 +1,78 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copies one vector of 128 field elements, stored as 8 uint64_t limbs,
+// from in to out.
+//   in  - source; 8 limbs are read
+//   out - destination; 8 limbs are written
+// Limbs are transferred in ascending order.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    const uint64_t *const end = in + 8;
+    while (in != end) {
+        *out++ = *in++;
+    }
+}
+
+
+// Adds one vector of 128 field elements (8 uint64_t limbs) into acc.
+// Addition is a limb-wise XOR.
+//   in  - addend; 8 limbs are read
+//   acc - accumulator; 8 limbs are updated in place
+// Limbs are processed in ascending order.
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *const end = in + 8;
+    while (in != end) {
+        *acc++ ^= *in++;
+    }
+}
+
+// acc ^= in * x over 8 limbs: each limb goes through gf16v_mul_u64 with the constant 0x2 (the field element x).
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc ^= in * x^-1 over 8 limbs: the constant 0x9 is x^3 + 1, the inverse of x modulo x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc ^= a * in over 8 limbs, with each limb scaled by gf16v_mul_u64(in[i], a).
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 8; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static // Folds the 8-limb vectors at bins + i*8 into the one at bins + 1*8 and copies that to out; bins is modified in place.
+    inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // Each step XORs x^-1 * (source bin) or x * (source bin) into another bin. Presumably bins[i] holds the sum of vectors to be scaled by field element i -- TODO confirm against callers.
+    m_vec_mul_add_x_inv_128(bins +  5 * 8, bins +  10 * 8);
+    m_vec_mul_add_x_128(bins + 11 * 8, bins + 12 * 8);
+    m_vec_mul_add_x_inv_128(bins +  10 * 8, bins +  7 * 8);
+    m_vec_mul_add_x_128(bins + 12 * 8, bins +  6 * 8);
+    m_vec_mul_add_x_inv_128(bins +  7 * 8, bins +  14 * 8);
+    m_vec_mul_add_x_128(bins +  6 * 8, bins +  3 * 8);
+    m_vec_mul_add_x_inv_128(bins +  14 * 8, bins +  15 * 8);
+    m_vec_mul_add_x_128(bins +  3 * 8, bins +  8 * 8);
+    m_vec_mul_add_x_inv_128(bins +  15 * 8, bins +  13 * 8);
+    m_vec_mul_add_x_128(bins +  8 * 8, bins +  4 * 8);
+    m_vec_mul_add_x_inv_128(bins +  13 * 8, bins +  9 * 8);
+    m_vec_mul_add_x_128(bins +  4 * 8, bins +  2 * 8);
+    m_vec_mul_add_x_inv_128(bins +   9 * 8, bins +  1 * 8); // the last two steps fold bins 9 and 2 into bin 1
+    m_vec_mul_add_x_128(bins +  2 * 8, bins +  1 * 8);
+    vec_copy_128(bins + 8, out); // bins + 8 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h
new file mode 100644
index 0000000000..9f7535c878
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_64.h
@@ -0,0 +1,69 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copies one vector of 64 field elements (4 uint64_t limbs) from in to out, in ascending limb order.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    const uint64_t *const end = in + 4;
+    while (in != end) {
+        *out++ = *in++;
+    }
+}
+
+// Adds (limb-wise XOR) one vector of 64 field elements (4 uint64_t limbs) into acc.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *const end = in + 4;
+    while (in != end) {
+        *acc++ ^= *in++;
+    }
+}
+
+// acc ^= a * in over 4 limbs, with each limb scaled by gf16v_mul_u64(in[i], a).
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+// acc ^= in * x over 4 limbs: each limb goes through gf16v_mul_u64 with the constant 0x2 (the field element x).
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+// acc ^= in * x^-1 over 4 limbs: the constant 0x9 is x^3 + 1, the inverse of x modulo x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 4; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+static // Folds the 4-limb vectors at bins + i*4 into the one at bins + 1*4 and copies that to out; bins is modified in place.
+inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // Each step XORs x^-1 * (source bin) or x * (source bin) into another bin. Presumably bins[i] holds the sum of vectors to be scaled by field element i -- TODO confirm against callers.
+    m_vec_mul_add_x_inv_64(bins +  5 * 4, bins +  10 * 4);
+    m_vec_mul_add_x_64(bins + 11 * 4, bins + 12 * 4);
+    m_vec_mul_add_x_inv_64(bins +  10 * 4, bins +  7 * 4);
+    m_vec_mul_add_x_64(bins + 12 * 4, bins +  6 * 4);
+    m_vec_mul_add_x_inv_64(bins +  7 * 4, bins +  14 * 4);
+    m_vec_mul_add_x_64(bins +  6 * 4, bins +  3 * 4);
+    m_vec_mul_add_x_inv_64(bins +  14 * 4, bins +  15 * 4);
+    m_vec_mul_add_x_64(bins +  3 * 4, bins +  8 * 4);
+    m_vec_mul_add_x_inv_64(bins +  15 * 4, bins +  13 * 4);
+    m_vec_mul_add_x_64(bins +  8 * 4, bins +  4 * 4);
+    m_vec_mul_add_x_inv_64(bins +  13 * 4, bins +  9 * 4);
+    m_vec_mul_add_x_64(bins +  4 * 4, bins +  2 * 4);
+    m_vec_mul_add_x_inv_64(bins +   9 * 4, bins +  1 * 4); // the last two steps fold bins 9 and 2 into bin 1
+    m_vec_mul_add_x_64(bins +  2 * 4, bins +  1 * 4);
+    vec_copy_64(bins + 4, out); // bins + 4 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h
new file mode 100644
index 0000000000..86359679fb
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_96.h
@@ -0,0 +1,74 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_common.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copies one vector of 96 field elements, stored as 6 uint64_t limbs,
+// from in to out.
+// Limbs are transferred in ascending order.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    const uint64_t *const end = in + 6;
+    while (in != end) {
+        *out++ = *in++;
+    }
+}
+
+// Adds one vector of 96 field elements (6 uint64_t limbs) into acc.
+// Addition is a limb-wise XOR.
+// Limbs are processed in ascending order.
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *const end = in + 6;
+    while (in != end) {
+        *acc++ ^= *in++;
+    }
+}
+
+// acc ^= in * x over 6 limbs: each limb goes through gf16v_mul_u64 with the constant 0x2 (the field element x).
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x2);
+    }
+}
+
+// acc ^= in * x^-1 over 6 limbs: the constant 0x9 is x^3 + 1, the inverse of x modulo x^4 + x + 1.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], 0x9);
+    }
+}
+
+// acc ^= a * in over 6 limbs, with each limb scaled by gf16v_mul_u64(in[i], a).
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int limb = 0; limb < 6; ++limb) {
+        acc[limb] = acc[limb] ^ gf16v_mul_u64(in[limb], a);
+    }
+}
+
+static // Folds the 6-limb vectors at bins + i*6 into the one at bins + 1*6 and copies that to out; bins is modified in place.
+inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // Each step XORs x^-1 * (source bin) or x * (source bin) into another bin. Presumably bins[i] holds the sum of vectors to be scaled by field element i -- TODO confirm against callers.
+    m_vec_mul_add_x_inv_96(bins +  5 * 6, bins +  10 * 6);
+    m_vec_mul_add_x_96(bins + 11 * 6, bins + 12 * 6);
+    m_vec_mul_add_x_inv_96(bins +  10 * 6, bins +  7 * 6);
+    m_vec_mul_add_x_96(bins + 12 * 6, bins +  6 * 6);
+    m_vec_mul_add_x_inv_96(bins +  7 * 6, bins +  14 * 6);
+    m_vec_mul_add_x_96(bins +  6 * 6, bins +  3 * 6);
+    m_vec_mul_add_x_inv_96(bins +  14 * 6, bins +  15 * 6);
+    m_vec_mul_add_x_96(bins +  3 * 6, bins +  8 * 6);
+    m_vec_mul_add_x_inv_96(bins +  15 * 6, bins +  13 * 6);
+    m_vec_mul_add_x_96(bins +  8 * 6, bins +  4 * 6);
+    m_vec_mul_add_x_inv_96(bins +  13 * 6, bins +  9 * 6);
+    m_vec_mul_add_x_96(bins +  4 * 6, bins +  2 * 6);
+    m_vec_mul_add_x_inv_96(bins +   9 * 6, bins +  1 * 6); // the last two steps fold bins 9 and 2 into bin 1
+    m_vec_mul_add_x_96(bins +  2 * 6, bins +  1 * 6);
+    vec_copy_96(bins + 6, out); // bins + 6 is bin 1
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h
new file mode 100644
index 0000000000..eeb13dc0bd
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/arithmetic_common.h
@@ -0,0 +1,175 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdint.h>
+#include <immintrin.h>
+
+#define K_OVER_2 ((K_MAX+1)/2)
+
+static const unsigned char __gf16_mulbase[128] __attribute__((aligned(32))) = { // 4 rows of 32 bytes: the GF(16) multiplication tables for 1, 2, 4 and 8 (entry j = constant * j), each repeated in both 16-byte halves
+    0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,
+    0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d, 0x00, 0x02, 0x04, 0x06, 0x08, 0x0a, 0x0c, 0x0e, 0x03, 0x01, 0x07, 0x05, 0x0b, 0x09, 0x0f, 0x0d,
+    0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09, 0x00, 0x04, 0x08, 0x0c, 0x03, 0x07, 0x0b, 0x0f, 0x06, 0x02, 0x0e, 0x0a, 0x05, 0x01, 0x0d, 0x09,
+    0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01, 0x00, 0x08, 0x03, 0x0b, 0x06, 0x0e, 0x05, 0x0d, 0x0c, 0x04, 0x0f, 0x07, 0x0a, 0x02, 0x09, 0x01
+};
+
+//
+// generate multiplication table for '4-bit' variable 'b'. Taken from OV paper!
+//
+static inline __m256i tbl32_gf16_multab2( uint8_t b ) {
+    // Broadcast the low nibble of b; 'half' is the same value shifted right by one, so bits 1 and 3 can be probed with the masks for bits 0 and 2.
+    const __m256i nib  = _mm256_set1_epi16( b & 0xf );
+    const __m256i half = _mm256_srli_epi16( nib, 1 );
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+    // Lane masks: all-ones where the tested bit of b is set, all-zeros otherwise.
+    const __m256i sel1 = _mm256_cmpgt_epi16(_mm256_and_si256(nib,  bit0), zero);
+    const __m256i sel2 = _mm256_cmpgt_epi16(_mm256_and_si256(half, bit0), zero);
+    const __m256i sel4 = _mm256_cmpgt_epi16(_mm256_and_si256(nib,  bit2), zero);
+    const __m256i sel8 = _mm256_cmpgt_epi16(_mm256_and_si256(half, bit2), zero);
+    // XOR together the selected 32-byte rows of __gf16_mulbase.
+    __m256i tab = _mm256_and_si256(_mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)), sel1);
+    tab = _mm256_xor_si256(tab, _mm256_and_si256(_mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)), sel2));
+    tab = _mm256_xor_si256(tab, _mm256_and_si256(_mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)), sel4));
+    tab = _mm256_xor_si256(tab, _mm256_and_si256(_mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)), sel8));
+    return tab;
+}
+
+static inline __m256i linear_transform_8x8_256b( __m256i tab_l, __m256i tab_h, __m256i v, __m256i mask_f ) { // byte-wise lookup: tab_l indexed by (v & mask_f), tab_h by ((v >> 4) & mask_f), results XORed
+    return _mm256_xor_si256(_mm256_shuffle_epi8(tab_l, _mm256_and_si256(v, mask_f)), _mm256_shuffle_epi8(tab_h, _mm256_and_si256(_mm256_srli_epi16(v, 4), mask_f)));
+}
+
+// Multiplies the packed nibbles of a by b: builds the table for b, a copy shifted into the high nibbles, and applies both via linear_transform_8x8_256b().
+static inline __m256i gf16v_mul_avx2( __m256i a, uint8_t b ) {
+    const __m256i lo_tab = tbl32_gf16_multab2( b );
+    const __m256i nibble_mask = _mm256_set1_epi8(0xf);
+    return linear_transform_8x8_256b( lo_tab, _mm256_slli_epi16( lo_tab, 4 ), a, nibble_mask );
+}
+
+// For each pair of adjacent entries (c, c+1) in each row of O (O[O_MAX*r + c]), stores one combined table at O_multabs[O_MAX/2*r + c/2]:
+// the table of the even entry in the low nibbles, XORed with the table of the odd entry shifted into the high nibbles.
+static inline void mayo_O_multabs_avx2(const unsigned char *O, __m256i *O_multabs){
+    for (size_t row = 0; row < V_MAX; row++) {
+        for (size_t col = 0; col < O_MAX; col += 2) {
+            const __m256i even_tab = tbl32_gf16_multab2(O[O_MAX*row + col]);
+            const __m256i odd_tab  = tbl32_gf16_multab2(O[O_MAX*row + col + 1]);
+            O_multabs[O_MAX/2*row + col/2] = _mm256_xor_si256(even_tab, _mm256_slli_epi16(odd_tab, 4));
+        }
+    }
+}
+
+
+// Builds multiplication tables from V, read as V[V_MAX*row + col]. For each column, rows (r, r+1) share one table:
+// row r in the low nibbles, row r+1 shifted into the high nibbles. An odd K_MAX leaves a final unpaired row, stored without a partner.
+static inline void mayo_V_multabs_avx2(const unsigned char *V, __m256i *V_multabs){
+    for (size_t col = 0; col < V_MAX; col++) {
+        size_t row = 0;
+        for (; row+1 < K_MAX; row += 2) {
+            const __m256i lo_tab = tbl32_gf16_multab2(V[V_MAX*row + col]);
+            const __m256i hi_tab = tbl32_gf16_multab2(V[V_MAX*(row+1) + col]);
+            V_multabs[K_OVER_2*col + row/2] = _mm256_xor_si256(lo_tab, _mm256_slli_epi16(hi_tab, 4));
+        }
+#if K_MAX % 2 == 1
+        V_multabs[K_OVER_2*col + row/2] = tbl32_gf16_multab2(V[V_MAX*row + col]);
+#endif
+    }
+}
+
+// Precomputed GF(16) multiplication tables: 16 rows of 32 bytes, one row per
+// multiplier. Within row i, byte j (0 <= j < 16) is the product i*j, with
+// reduction by x^4 + x + 1 (e.g. row 2, column 8 is 0x03); the 16-byte
+// pattern is stored twice so a row fills a whole 256-bit register.
+// The array is 32-byte aligned because rows are fetched with
+// _mm256_load_si256 at offset 32*value by the S1/S2 table builders.
+static const unsigned char mayo_gf16_mul[512] __attribute__((aligned(32))) = {
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 
+    0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00, 0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f, 
+    0x00,0x01,0x02,0x03,0x04,0x05,0x06,0x07, 0x08,0x09,0x0a,0x0b,0x0c,0x0d,0x0e,0x0f,
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d, 
+    0x00,0x02,0x04,0x06,0x08,0x0a,0x0c,0x0e, 0x03,0x01,0x07,0x05,0x0b,0x09,0x0f,0x0d,
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02, 
+    0x00,0x03,0x06,0x05,0x0c,0x0f,0x0a,0x09, 0x0b,0x08,0x0d,0x0e,0x07,0x04,0x01,0x02,
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09, 
+    0x00,0x04,0x08,0x0c,0x03,0x07,0x0b,0x0f, 0x06,0x02,0x0e,0x0a,0x05,0x01,0x0d,0x09,
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06, 
+    0x00,0x05,0x0a,0x0f,0x07,0x02,0x0d,0x08, 0x0e,0x0b,0x04,0x01,0x09,0x0c,0x03,0x06,
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04, 
+    0x00,0x06,0x0c,0x0a,0x0b,0x0d,0x07,0x01, 0x05,0x03,0x09,0x0f,0x0e,0x08,0x02,0x04,
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b, 
+    0x00,0x07,0x0e,0x09,0x0f,0x08,0x01,0x06, 0x0d,0x0a,0x03,0x04,0x02,0x05,0x0c,0x0b,
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01, 
+    0x00,0x08,0x03,0x0b,0x06,0x0e,0x05,0x0d, 0x0c,0x04,0x0f,0x07,0x0a,0x02,0x09,0x01,
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e, 
+    0x00,0x09,0x01,0x08,0x02,0x0b,0x03,0x0a, 0x04,0x0d,0x05,0x0c,0x06,0x0f,0x07,0x0e,
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c, 
+    0x00,0x0a,0x07,0x0d,0x0e,0x04,0x09,0x03, 0x0f,0x05,0x08,0x02,0x01,0x0b,0x06,0x0c,
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03, 
+    0x00,0x0b,0x05,0x0e,0x0a,0x01,0x0f,0x04, 0x07,0x0c,0x02,0x09,0x0d,0x06,0x08,0x03,
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08, 
+    0x00,0x0c,0x0b,0x07,0x05,0x09,0x0e,0x02, 0x0a,0x06,0x01,0x0d,0x0f,0x03,0x04,0x08,
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07, 
+    0x00,0x0d,0x09,0x04,0x01,0x0c,0x08,0x05, 0x02,0x0f,0x0b,0x06,0x03,0x0e,0x0a,0x07,
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05, 
+    0x00,0x0e,0x0f,0x01,0x0d,0x03,0x02,0x0c, 0x09,0x07,0x06,0x08,0x04,0x0a,0x0b,0x05,
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a, 
+    0x00,0x0f,0x0d,0x02,0x09,0x06,0x04,0x0b, 0x01,0x0e,0x0c,0x03,0x08,0x07,0x05,0x0a};
+
+
+// Builds multiplication tables for S1 by loading precomputed rows of
+// mayo_gf16_mul instead of generating them. Entry (r, c) is read from
+// S1[V_MAX*r + c]; rows r and r+1 are merged into one register (second table
+// shifted left by 4 bits), and an odd K_MAX leaves the final row unpaired.
+// Every column c fills K_OVER_2 consecutive registers of S1_multabs.
+static
+inline void mayo_S1_multabs_avx2(const unsigned char *S1, __m256i *S1_multabs) {
+    for (size_t col = 0; col < V_MAX; col++) {
+        size_t row = 0;
+        while (row + 1 < K_MAX) {
+            const __m256i lo_tab = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+            const __m256i hi_tab = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*(row+1) + col]));
+            S1_multabs[K_OVER_2*col + row/2] = lo_tab ^ _mm256_slli_epi16(hi_tab, 4);
+            row += 2;
+        }
+#if K_MAX % 2 == 1
+        // unpaired final row
+        S1_multabs[K_OVER_2*col + row/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S1[V_MAX*row + col]));
+#endif
+    }
+}
+
+// Same construction as for S1, but over O_MAX columns: entry (r, c) is read
+// from S2[O_MAX*r + c], its table row is fetched from mayo_gf16_mul, and
+// rows r / r+1 are merged into one register (second table shifted left by
+// 4 bits). With an odd K_MAX the last row is stored on its own.
+static
+inline void mayo_S2_multabs_avx2(const unsigned char *S2, __m256i *S2_multabs) {
+    for (size_t col = 0; col < O_MAX; col++) {
+        size_t row = 0;
+        while (row + 1 < K_MAX) {
+            const __m256i lo_tab = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+            const __m256i hi_tab = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*(row+1) + col]));
+            S2_multabs[K_OVER_2*col + row/2] = lo_tab ^ _mm256_slli_epi16(hi_tab, 4);
+            row += 2;
+        }
+#if K_MAX % 2 == 1
+        // unpaired final row
+        S2_multabs[K_OVER_2*col + row/2] = _mm256_load_si256((__m256i *)(mayo_gf16_mul + 32*S2[O_MAX*row + col]));
+#endif
+    }
+}
+
+// Multiplies each of the 16 nibbles packed in `a` by the low four bits of
+// `b`, nibble-wise in GF(16) (reduction by x^4 + x + 1), without any
+// data-dependent branch or table lookup.
+// Shift-and-add: `shifted` is repeatedly multiplied by x, and it is added
+// into the accumulator whenever the matching bit of `b` is set.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t top_bits = 0x8888888888888888ULL;
+    uint64_t shifted = a;
+    uint64_t acc = shifted * (uint64_t)(b & 1);
+
+    for (int bit = 1; bit < 4; bit++) {
+        // nibbles whose top bit is set overflow on the shift and must be
+        // reduced: x^4 = x + 1, i.e. XOR 3 into that nibble
+        const uint64_t carry = shifted & top_bits;
+        shifted = ((shifted ^ carry) << 1) ^ ((carry >> 3) * 3);
+        acc ^= shifted * (uint64_t)((b >> bit) & 1);
+    }
+
+    return acc;
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h
new file mode 100644
index 0000000000..fa69de0ab2
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form.h
@@ -0,0 +1,88 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <immintrin.h>
+#include <stdint.h>
+
+
+// Larger / smaller of two values. Function-like macros: each argument may be
+// evaluated twice, so do not pass expressions with side effects.
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+
+//
+// Generate the multiplication table for the 4-bit value 'b'.
+// From https://eprint.iacr.org/2023/059/.
+//
+// One 32-byte base table from __gf16_mulbase is selected per bit of 'b'
+// (bits 0..3 -> tables 0..3) and the selected tables are XORed together.
+// Selection is done with all-ones / all-zeros lane masks, so there is no
+// branch on 'b'.
+//
+static inline __m256i tbl32_gf16_multab( uint8_t b ) {
+    const __m256i zero = _mm256_setzero_si256();
+    const __m256i bit0 = _mm256_set1_epi16(1);
+    const __m256i bit2 = _mm256_set1_epi16(4);
+
+    // b in every 16-bit lane, and b >> 1 so bits 1 and 3 line up with bit0 / bit2
+    const __m256i b_lanes = _mm256_set1_epi16( b & 0xf );
+    const __m256i b_shr1 = _mm256_srli_epi16( b_lanes, 1 );
+
+    const __m256i sel0 = _mm256_cmpgt_epi16( b_lanes & bit0, zero );
+    const __m256i sel1 = _mm256_cmpgt_epi16( b_shr1 & bit0, zero );
+    const __m256i sel2 = _mm256_cmpgt_epi16( b_lanes & bit2, zero );
+    const __m256i sel3 = _mm256_cmpgt_epi16( b_shr1 & bit2, zero );
+
+    __m256i result = _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 0)) & sel0;
+    result ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 1)) & sel1;
+    result ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 2)) & sel2;
+    result ^= _mm256_load_si256((__m256i const *) (__gf16_mulbase + 32 * 3)) & sel3;
+    return result;
+}
+
+/* put matrix in row echelon form with ones on first nonzero entries in constant time*/
+/*
+ * A is a row-major byte matrix (one byte per entry) and is transformed in
+ * place. The runtime arguments _nrows / _ncols are ignored: the dimensions
+ * are fixed at compile time to M_MAX rows and K_MAX * O_MAX + 1 columns.
+ *
+ * The matrix is copied into an array of 256-bit registers with every row
+ * right-aligned (padding at the front of the row), processed one pivot
+ * column at a time by the code in echelon_form_loop.h, and copied back.
+ * Both scratch buffers are wiped before returning.
+ *
+ * NOTE(review): nrows, ncols, AVX_REGS_PER_ROW and MAX_COLS are #defined
+ * inside this function and never #undef'd, so they remain defined in every
+ * translation unit that includes this header.
+ * NOTE(review): the leading padding bytes of each A_avx row are not written
+ * by the load loop below; the elimination reads whole registers, so it also
+ * reads those bytes, but only the ncols tail bytes are copied back to A.
+ */
+static inline void EF(unsigned char *A, int _nrows, int _ncols) {
+
+    (void) _nrows;
+    (void) _ncols;
+
+    #define nrows M_MAX
+    #define ncols (K_MAX * O_MAX + 1)
+
+    // registers needed to hold one row, and the padded row length in bytes
+    #define AVX_REGS_PER_ROW ((K_MAX * O_MAX + 1 + 31) / 32)
+    #define MAX_COLS (AVX_REGS_PER_ROW * 32)
+
+    __m256i _pivot_row[AVX_REGS_PER_ROW];
+    __m256i A_avx[AVX_REGS_PER_ROW* M_MAX];
+
+    // byte-granular views of the two register arrays
+    unsigned char* pivot_row_bytes = (unsigned char*) _pivot_row;
+    unsigned char* A_bytes = (unsigned char*) A_avx;
+
+    // load A in the tail of AVX2 registers
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++)
+        {
+            A_bytes[i*MAX_COLS + (MAX_COLS - ncols) + j] = A[ i*ncols + j ];
+        }
+    }
+
+    // pivot row is secret, pivot col is not
+    unsigned char inverse;
+    int pivot_row = 0;
+    // first real (non-padding) column of the padded rows
+    int pivot_col = MAYO_MAX(MAX_COLS - ncols,0);
+    // The same loop body is instantiated five times, with the column range
+    // split at 32-column (one register) boundaries.
+    // NOTE(review): presumably this lets the compiler specialise each copy
+    // for the number of registers still in play -- confirm.
+    for (; pivot_col < MAX_COLS-128; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-96; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-64; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS-32; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+    for (; pivot_col < MAX_COLS; pivot_col++) {
+        #include "echelon_form_loop.h"
+    }
+
+    // write the matrix A back
+    for (int i = 0; i < nrows; i++) {
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = A_bytes[i*AVX_REGS_PER_ROW*32 + (MAX_COLS - ncols) + j];
+        }
+    }
+    // wipe the scratch copies of the (secret) matrix
+    mayo_secure_clear(_pivot_row, AVX_REGS_PER_ROW * 32);
+    mayo_secure_clear(A_avx, AVX_REGS_PER_ROW * 32 * nrows);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h
new file mode 100644
index 0000000000..b8b29741c4
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/echelon_form_loop.h
@@ -0,0 +1,58 @@
+// SPDX-License-Identifier: Apache-2.0
+
+/*
+ * Body of one elimination step. This fragment is not a standalone header: it
+ * is textually included inside each pivot_col loop of EF() and relies on that
+ * function's locals (A_avx, A_bytes, _pivot_row, pivot_row_bytes, pivot_row,
+ * pivot_col, inverse) and macros (nrows, ncols, MAX_COLS, AVX_REGS_PER_ROW).
+ *
+ * NOTE(review): the comments below assume ct_compare_32(a, b) yields 0 when
+ * a == b and all-ones otherwise, and ct_is_greater_than(a, b) yields all-ones
+ * when a > b -- confirm against their definitions.
+ */
+
+/* index of the 256-bit register that contains pivot_col; registers to the
+   left of it are not touched in this step */
+int pivot_col_rounded = pivot_col/32;
+
+int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - MAX_COLS);
+int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col - MAX_COLS + ncols);
+/* the pivot row is guaranteed to be between these lower and upper bounds if A has full rank*/
+
+/* zero out pivot row */
+for (int i = pivot_col_rounded; i < AVX_REGS_PER_ROW; i++) {
+    _pivot_row[i] = _mm256_set1_epi8(0);
+}
+
+/* try to get a pivot row in constant time */
+/* Every candidate row is visited. Row pivot_row is always added; rows below
+   it are added only while the accumulated pivot entry is still zero. */
+unsigned char pivot = 0;
+uint32_t pivot_is_zero = -1;
+for (int row = pivot_row_lower_bound;
+        row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+    uint32_t is_pivot_row = ~ct_compare_32(row, pivot_row);
+    uint32_t below_pivot_row = ct_is_greater_than(row, pivot_row);
+    __m256i mask = _mm256_set1_epi32( is_pivot_row | (below_pivot_row & pivot_is_zero) );
+    for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+        _pivot_row[j] ^= mask & A_avx[row * AVX_REGS_PER_ROW + j];
+    }
+    /* re-read the candidate pivot entry after each row has been folded in */
+    pivot = pivot_row_bytes[pivot_col];
+    pivot_is_zero = ~ct_compare_32((int) pivot, 0);
+}
+
+/* multiply pivot row by inverse of pivot */
+inverse = inverse_f(pivot);
+__m256i inverse_multab = tbl32_gf16_multab(inverse);
+
+/* byte-wise table lookup: each entry of the pivot row indexes the table */
+for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+    _pivot_row[j] = _mm256_shuffle_epi8(inverse_multab, _pivot_row[j]);
+}
+
+/* conditionally write pivot row to the correct row, if there is a nonzero pivot */
+/* eliminate entries below pivot */
+for (int row = pivot_row_lower_bound; row < nrows; row++) {
+    unsigned char below_pivot =  (unsigned char) (ct_is_greater_than(row, pivot_row));
+    unsigned char elt_to_elim = A_bytes[row*AVX_REGS_PER_ROW*32 + pivot_col];
+
+    /* the multiplier is forced to zero for rows that are not below the pivot
+       row, so the XOR further down leaves them unchanged */
+    __m256i multab = tbl32_gf16_multab(below_pivot & elt_to_elim);
+    if (row <= pivot_row_upper_bound) {
+        /* rows that may be the pivot row: blend in the normalised pivot row
+           when row == pivot_row and a nonzero pivot was found */
+        __m256i mask = _mm256_set1_epi32(~ct_compare_32(row, pivot_row) & ~pivot_is_zero);
+        for (int col = pivot_col_rounded; col < AVX_REGS_PER_ROW; col++) { 
+            A_avx[row*AVX_REGS_PER_ROW + col] = _mm256_blendv_epi8(A_avx[row*AVX_REGS_PER_ROW + col], _pivot_row[col], mask) ^
+                                                    _mm256_shuffle_epi8(multab, _pivot_row[col]);
+        }
+    } else {
+        /* rows past the upper bound cannot be the pivot row: eliminate only */
+        for (int j = pivot_col_rounded; j < AVX_REGS_PER_ROW; j++) {
+            A_avx[row*AVX_REGS_PER_ROW + j] ^= _mm256_shuffle_epi8(multab, _pivot_row[j]);
+        }
+    }
+}
+
+/* ~pivot_is_zero is all-ones (-1 as int32_t) when a pivot was found, so this
+   advances pivot_row by one in that case and by zero otherwise */
+pivot_row += (-(int32_t)(~pivot_is_zero));
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+// Smaller of two values; each argument may be evaluated twice.
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+// PRF that expands the public-key seed into the P1 and P2 bytes
+// (called with seed_pk / cpk as its input in this file).
+#define PK_PRF AES_128_CTR
+
+// Unpacks nibbles: writes mdeclen output bytes to mdec, each holding one
+// 4-bit value taken from m, low nibble of every input byte first. For an odd
+// mdeclen only the low nibble of the last input byte is used.
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+
+    for (int idx = 0; idx < full_bytes; ++idx) {
+        mdec[2 * idx]     = m[idx] & 0xf;
+        mdec[2 * idx + 1] = m[idx] >> 4;
+    }
+
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+// Packs nibbles: combines mlen input bytes from m two at a time into menc,
+// the first of each pair in the low nibble and the second in the high
+// nibble. For an odd mlen the final input byte is stored as-is.
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int pairs = mlen / 2;
+
+    for (int idx = 0; idx < pairs; ++idx) {
+        const unsigned char lo = m[2 * idx];
+        const unsigned char hi = m[2 * idx + 1];
+        menc[idx] = lo | (hi << 4);
+    }
+
+    if (mlen % 2 == 1) {
+        menc[pairs] = m[2 * pairs];
+    }
+}
+
+// Folds the k x k blocks of _vPv into one accumulator of m nibbles and adds
+// the result to t, writing y[i] = t[i] ^ acc[i] for i in [0, m).
+//
+//   p     parameter set (unused unless ENABLE_PARAMS_DYNAMIC is defined)
+//   _vPv  k*k blocks of m/16 64-bit words each; block (i, j) starts at word
+//         (i*k + j) * (m/16)
+//   t     m input bytes, one value per byte
+//   y     m output bytes; mayo_verify passes the same buffer for t and y
+//
+// Before each block is added, the accumulator is multiplied by X (shifted up
+// by one nibble) and the nibble shifted out at the top is reduced using the
+// F_TAIL_LEN coefficients in PARAM_f_tail(p).
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0};
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    // k indexes the top word of temp; on the very first pass temp is all
+    // zero, so starting at 0 is harmless
+    int k = 0;
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            unsigned char top = temp[k] >> 60;
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            // coefficient jj of the tail lands in nibble jj of temp: even jj
+            // in the low nibble of byte jj/2, odd jj in the high nibble
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else {
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            // off-diagonal pairs contribute both block (i, j) and block (j, i)
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            // leave k on the top word for the next shift
+            k--;
+        }
+    }
+
+    // add to y
+    // each byte of temp carries two values: low nibble first, then high
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+// Transposes, in place, a 16x16 matrix of 4-bit entries stored as sixteen
+// 64-bit words (one row per word, entry c of a row in bits 4c..4c+3).
+// Four block-swap stages are run with block sizes 1, 2, 4 and 8: in stage s,
+// rows i and i + 2^s (for every i whose bit s is clear) exchange the
+// off-diagonal blocks of 2^s entries. Row pairs within a stage are disjoint,
+// so the order in which they are processed does not matter.
+static void transpose_16x16_nibbles(uint64_t *M){
+    // per stage: mask selecting the low block of every pair of blocks
+    static const uint64_t low_block_mask[4] = {
+        0x0f0f0f0f0f0f0f0f,
+        0x00ff00ff00ff00ff,
+        0x0000ffff0000ffff,
+        0x00000000ffffffff
+    };
+
+    for (int stage = 0; stage < 4; stage++) {
+        const size_t dist = (size_t)1 << stage;   // row distance: 1, 2, 4, 8
+        const int shift = 4 << stage;             // block width in bits: 4, 8, 16, 32
+        const uint64_t mask = low_block_mask[stage];
+
+        for (size_t upper = 0; upper < 16; upper++) {
+            if (upper & dist) {
+                continue;   // this row is the lower partner of an earlier row
+            }
+            const size_t lower = upper + dist;
+            const uint64_t delta = ((M[upper] >> shift) ^ M[lower]) & mask;
+            M[upper] ^= delta << shift;
+            M[lower] ^= delta;
+        }
+    }
+}
+
+// M_MAX / 8 rounded up; used to size the working matrix in compute_A.
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+// Builds the byte matrix A_out (one value per byte, PARAM_A_cols(p) bytes
+// per row, m rows) from the k blocks stored in _VtL.
+//
+//   p      parameter set (unused unless ENABLE_PARAMS_DYNAMIC is defined)
+//   _VtL   k blocks of o columns, m/16 64-bit words per column
+//   A_out  output; columns 0 .. A_cols-2 of each of the m rows are written,
+//          the last column of each row is left untouched
+//
+// Steps: (1) accumulate the blocks into a word matrix A, each (i, j) pair
+// shifted down by one more nibble than the previous pair; (2) transpose
+// every 16x16 nibble tile; (3) fold the rows at index >= m back into the
+// first rows using the f_tail coefficients; (4) unpack nibbles into A_out.
+//
+// NOTE(review): the local matrix A is derived from _VtL and is not wiped
+// before return -- confirm whether that matters for the caller's secrets.
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    // current shift, split into whole 64-bit words and remaining bits
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    // row stride of A in words: o*k rounded up to a multiple of 16
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    // bits pushed out of the top of this word spill into the next one
+                    if(bits_to_shift > 0){
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            // the symmetric contribution: M_i into column block j
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            // next pair is shifted by one more nibble
+            bits_to_shift += 4;
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    // transpose every 16x16 nibble tile, covering m + k(k+1)/2 rows
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    // tab[4*i + b] = f_tail[i] * 2^b, i.e. the coefficient times each single bit
+    unsigned char tab[F_TAIL_LEN*4] = {0};
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    // fold rows m .. m + k(k+1)/2 - 1 back into the rows below m
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            // split every nibble of the row into its four bit planes
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble;
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    // decode() below reads the words byte by byte
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    // unpack: every 64-bit word yields up to 16 consecutive entries of one output row
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+// Generates a key pair by delegating to mayo_keypair_compact and returns
+// that function's status unchanged.
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+// Signs message m (mlen bytes) with the compact secret key csk.
+//
+//   p       parameter set
+//   sig     output buffer, receives PARAM_sig_bytes(p) bytes
+//   siglen  set to PARAM_sig_bytes(p) on success, left untouched on failure
+//   m/mlen  message; it is hashed before anything is written to sig
+//   csk     compact secret key (the secret seed)
+//
+// Returns MAYO_OK on success. Returns the error from mayo_expand_sk, or
+// MAYO_ERR if randombytes fails or if none of the 256 counter values yields
+// a solvable system. In the last case no signature is written; previously
+// the function went on to encode an indeterminate x and reported success.
+//
+// All secret-dependent buffers are wiped before returning, on every path.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    int solved = 0;                            // set once sample_solution succeeds
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    // tmp layout: digest || salt || seed_sk || counter byte
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0};
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    // try up to 256 counter values until the linear system is solvable
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        // Y is derived from the secret vectors and is not needed again
+        mayo_secure_clear(Y, sizeof(Y));
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            solved = 1;
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+
+    if (!solved) {
+        // every counter value failed: x holds no valid solution, so do not
+        // emit a signature
+        ret = MAYO_ERR;
+        goto err;
+    }
+
+    // every byte of s[0 .. k*n) is written below (mat_add fills the first
+    // n-o entries of each block, memcpy the remaining o), so s needs no
+    // initialisation
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    // the salt occupies the last salt_bytes of the signature
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(y, M_MAX);
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    return ret;
+}
+
+// Produces a signed message sm = signature || message.
+//
+//   p        parameter set
+//   sm       output buffer of at least PARAM_sig_bytes(p) + mlen bytes
+//   smlen    set to the total length on success, left untouched on failure
+//   m/mlen   message to sign; m may overlap sm (e.g. in-place signing)
+//   csk      compact secret key
+//
+// Returns the status of mayo_sign_signature.
+//
+// The message is moved to its final position before signing, and the
+// signature is computed over that copy. Writing the signature first would
+// overwrite the start of the message whenever m overlaps
+// sm[0 .. sig_bytes), and the subsequent memmove would copy corrupted data.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+
+    memmove(sm + param_sig_bytes, m, mlen);
+    ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret != MAYO_OK || siglen != (size_t) param_sig_bytes)
+        goto err;
+
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+// Verifies a signed message sm = signature || message and, when the
+// signature is valid, copies the message to m and stores its length in
+// *mlen.
+//
+// Returns MAYO_ERR if sm is shorter than a signature, otherwise the result
+// of mayo_verify. m and *mlen are only written on MAYO_OK; memmove is used,
+// so m may overlap sm.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const int sig_len = PARAM_sig_bytes(p);
+    if (smlen < (size_t)sig_len) {
+        return MAYO_ERR;
+    }
+
+    const unsigned char *msg = sm + sig_len;
+    const size_t msg_len = smlen - sig_len;
+
+    const int status = mayo_verify(p, msg, msg_len, sm, pk);
+    if (status != MAYO_OK) {
+        return status;
+    }
+
+    *mlen = msg_len;
+    memmove(m, msg, msg_len);
+    return status;
+}
+
+// Generates a compact key pair.
+//
+//   p    parameter set
+//   cpk  output: pk seed followed by the upper-triangular part of P3
+//        (PARAM_pk_seed_bytes(p) + PARAM_P3_bytes(p) bytes)
+//   csk  output: the secret seed (PARAM_sk_seed_bytes(p) bytes)
+//
+// Returns MAYO_OK, or MAYO_ERR if randombytes reports a failure.
+//
+// Besides O and P3, the buffers S and P are now wiped before returning:
+// S contains the encoded secret matrix O, and the P2 part of P is passed to
+// Ot_times_P1O_P2 together with O as working storage (named P1O_P2), so it
+// may hold O-dependent values afterwards.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // wipe everything that holds, or was computed from, secret material
+    mayo_secure_clear(S, sizeof(S));
+    mayo_secure_clear(P, sizeof(P));
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+// Expands a compact public key into pk: the P1 and P2 bytes are regenerated
+// from the seed at the start of cpk with PK_PRF, and the P3 bytes stored
+// after the seed in cpk are appended unchanged. Always returns MAYO_OK.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+    #ifdef MAYO_VARIANT
+    (void)p;
+    #endif
+    const int prf_out_bytes = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+    const int p3_bytes = PARAM_P3_bytes(p);
+    const int seed_bytes = PARAM_pk_seed_bytes(p);
+
+    PK_PRF(pk, prf_out_bytes, cpk, seed_bytes);
+    // P3 follows directly after P1 || P2 in the expanded key
+    memmove(pk + prf_out_bytes, cpk + seed_bytes, p3_bytes);
+    return MAYO_OK;
+}
+
+// Expands the compact secret key csk (the secret seed) into *sk.
+//
+// On return sk->p holds the P1 bytes followed by L = (P1 + P1^t)*O + P2,
+// written in place over the P2 bytes, and sk->o holds the encoded matrix O.
+// The local copies of S and O are wiped. Always returns MAYO_OK.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    // the public seed is the first pk_seed_bytes of S
+    unsigned char *seed_pk = S;
+
+    // S = shake256(seed_sk): pk seed followed by the encoded O
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap to the word order the arithmetic expects; undone again below
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    // L aliases P2: the result replaces P2 inside sk->p
+    uint64_t *L = P2;
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    // wipe the local buffers that hold secret material
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+// Verifies signature sig on message m (mlen bytes) under the compact public
+// key cpk.
+//
+// Returns MAYO_OK for a valid signature and MAYO_ERR otherwise (including
+// when expanding the public key fails).
+//
+// The target t is derived by hashing digest(m) together with the salt stored
+// in the last salt_bytes of sig. The value computed from the signature via
+// m_calculate_PS_SPS and compute_rhs must equal t.
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    // tmp layout: digest || salt
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    // the expanded key is laid out as P1 || P2 || P3
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    // y is all zero on entry and is passed as both input and output, so
+    // afterwards it holds just the reduced value
+    compute_rhs(p, SPS, y, y);
+    
+    // all inputs here are public, so a plain memcmp is sufficient
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5 // entries in each F_TAIL_* list below; listed lowest degree first (z^0 .. z^4), matching the f(z) comments
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o) // 58
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1) // 73; MAYO_1_k is defined on the next line, which is fine because macros expand at use
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32 // = m / 2
+#define MAYO_1_O_bytes 232 // = v * o / 2
+#define MAYO_1_v_bytes 29 // = ceil(v / 2)
+#define MAYO_1_r_bytes 36 // = k * o / 2
+#define MAYO_1_P1_bytes 54752 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_1_P2_bytes 14848 // = v * o * m_bytes
+#define MAYO_1_P3_bytes 1152 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_1_csk_bytes 24 // = sk_seed_bytes
+#define MAYO_1_esk_bytes 69856 // = sk_seed_bytes + P1_bytes + P2_bytes + O_bytes
+#define MAYO_1_cpk_bytes 1168 // = pk_seed_bytes + P3_bytes
+#define MAYO_1_epk_bytes 70752 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_1_sig_bytes 321 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o) // 60
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1) // 73; MAYO_2_k is defined on the next line, which is fine because macros expand at use
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32 // = m / 2
+#define MAYO_2_O_bytes 540 // = v * o / 2
+#define MAYO_2_v_bytes 30 // = ceil(v / 2)
+#define MAYO_2_r_bytes 36 // = k * o / 2
+#define MAYO_2_P1_bytes 58560 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_2_P2_bytes 34560 // = v * o * m_bytes
+#define MAYO_2_P3_bytes 5472 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_2_csk_bytes 24 // = sk_seed_bytes
+#define MAYO_2_esk_bytes 93684 // = sk_seed_bytes + P1_bytes + P2_bytes + O_bytes
+#define MAYO_2_cpk_bytes 5488 // = pk_seed_bytes + P3_bytes
+#define MAYO_2_epk_bytes 98592 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_2_sig_bytes 180 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o) // 89
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1) // 111; MAYO_3_k is defined on the next line, which is fine because macros expand at use
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48 // = m / 2
+#define MAYO_3_O_bytes 445 // = v * o / 2
+#define MAYO_3_v_bytes 45 // = ceil(v / 2)
+#define MAYO_3_r_bytes 55 // = k * o / 2
+#define MAYO_3_P1_bytes 192240 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_3_P2_bytes 42720 // = v * o * m_bytes
+#define MAYO_3_P3_bytes 2640 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_3_csk_bytes 32 // = sk_seed_bytes
+#define MAYO_3_esk_bytes 235437 // = sk_seed_bytes + P1_bytes + P2_bytes + O_bytes
+#define MAYO_3_cpk_bytes 2656 // = pk_seed_bytes + P3_bytes
+#define MAYO_3_epk_bytes 237600 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_3_sig_bytes 577 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o) // 121
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1) // 145; MAYO_5_k is defined on the next line, which is fine because macros expand at use
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64 // = m / 2
+#define MAYO_5_O_bytes 726 // = v * o / 2
+#define MAYO_5_v_bytes 61 // = ceil(v / 2)
+#define MAYO_5_r_bytes 72 // = k * o / 2
+#define MAYO_5_P1_bytes 472384 // = v * (v + 1) / 2 * m_bytes
+#define MAYO_5_P2_bytes 92928 // = v * o * m_bytes
+#define MAYO_5_P3_bytes 4992 // = o * (o + 1) / 2 * m_bytes
+#define MAYO_5_csk_bytes 40 // = sk_seed_bytes
+#define MAYO_5_esk_bytes 566078 // = sk_seed_bytes + P1_bytes + P2_bytes + O_bytes
+#define MAYO_5_cpk_bytes 5008 // = pk_seed_bytes + P3_bytes
+#define MAYO_5_epk_bytes 570304 // = P1_bytes + P2_bytes + P3_bytes
+#define MAYO_5_sig_bytes 838 // = ceil(n * k / 2) + salt_bytes
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b // token-pastes the arguments into a_b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b) // extra level so a and b are macro-expanded before pasting
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end) // e.g. MAYO_5_n when MAYO_VARIANT expands to MAYO_5
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c // yields pqmayo_<a>_<b>_<c>
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c) // expand arguments first, then paste
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF) // exactly one MAYO_BUILD_TYPE_* must be defined; it picks the middle name component
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s // no MAYO_VARIANT: symbols keep their plain names
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // run-time parameter selection: each bound is the largest value over MAYO_1/2/3/5
+#define NAME_MAX mayo5 // NOTE(review): NAME_MAX is also a POSIX <limits.h> macro name; watch for redefinition clashes
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18 // largest o comes from MAYO_2
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472 // largest P3 comes from MAYO_2
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488 // largest cpk comes from MAYO_2
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT) // one parameter set fixed at compile time: every bound is that set's own constant
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // taken from the variant's own sk_seed_bytes rather than aliased to SALT_BYTES_MAX (same value for all four sets)
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC // accessors read the mayo_params_t that p points to; p is parenthesized so any pointer expression (e.g. &params) works
+#define PARAM_name(p) ((p)->name)
+#define PARAM_m(p) ((p)->m)
+#define PARAM_n(p) ((p)->n)
+#define PARAM_o(p) ((p)->o)
+#define PARAM_v(p) ((p)->n - (p)->o)
+#define PARAM_A_cols(p) ((p)->k * (p)->o + 1)
+#define PARAM_k(p) ((p)->k)
+#define PARAM_q(p) ((p)->q)
+#define PARAM_m_bytes(p) ((p)->m_bytes)
+#define PARAM_O_bytes(p) ((p)->O_bytes)
+#define PARAM_v_bytes(p) ((p)->v_bytes)
+#define PARAM_r_bytes(p) ((p)->r_bytes)
+#define PARAM_P1_bytes(p) ((p)->P1_bytes)
+#define PARAM_P2_bytes(p) ((p)->P2_bytes)
+#define PARAM_P3_bytes(p) ((p)->P3_bytes)
+#define PARAM_csk_bytes(p) ((p)->csk_bytes)
+#define PARAM_esk_bytes(p) ((p)->esk_bytes)
+#define PARAM_cpk_bytes(p) ((p)->cpk_bytes)
+#define PARAM_epk_bytes(p) ((p)->epk_bytes)
+#define PARAM_sig_bytes(p) ((p)->sig_bytes)
+#define PARAM_f_tail(p) ((p)->f_tail)
+#define PARAM_salt_bytes(p) ((p)->salt_bytes)
+#define PARAM_sk_seed_bytes(p) ((p)->sk_seed_bytes)
+#define PARAM_digest_bytes(p) ((p)->digest_bytes)
+#define PARAM_pk_seed_bytes(p) ((p)->pk_seed_bytes)
+#elif defined(MAYO_VARIANT) // accessors ignore p and expand to the compile-time constants of MAYO_VARIANT
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail); // per-translation-unit copy of the variant's F_TAIL_* list
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters; fields are read through the PARAM_*(p) accessors when ENABLE_PARAMS_DYNAMIC is defined
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q; // 16 for every parameter set defined in this header
+    const unsigned char *f_tail; // coefficient list, F_TAIL_LEN entries (see F_TAIL_64 / F_TAIL_96 / F_TAIL_128)
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): this header defines no PARAM_R_bytes accessor and no MAYO_x_R_bytes constant; possibly unused, confirm
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // e.g. "MAYO_1"
+} mayo_params_t;
+
+typedef struct sk_t { // type of the esk output argument of mayo_expand_sk()
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8]; // P1_BYTES_MAX + P2_BYTES_MAX bytes, held as 64-bit words
+    uint8_t o[O_BYTES_MAX]; // O_BYTES_MAX raw bytes
+} sk_t;
+
+/**
+ * MAYO parameter sets (declared only when ENABLE_PARAMS_DYNAMIC is defined)
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes returned by the mayo_* functions declared below
+ */
+#define MAYO_OK 0 // success
+#define MAYO_ERR 1 // failure (for mayo_verify: the signature did not verify)
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair) // NOTE(review): mayo_keypair_compact cites the same spec routine; confirm how the two differ
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature) // NOTE(review): undocumented; presumably writes only the signature (not signature||message) to sig and its length to *siglen; confirm
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeds, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold cpk and csk.
+ * 
+ * Outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible for allocating sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int status code; the verification code treats any value other than MAYO_OK as failure
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible for providing the sk_t object that esk points to.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key (an sk_t, not a raw byte buffer).
+ * @return int status code (see MAYO_OK / MAYO_ERR)
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked
+ * @param[in] mlen Length of m in bytes
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 (MAYO_OK) if verification succeeded, 1 (MAYO_ERR) otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h b/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // byte-order reversal helpers: compiler builtins where available
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // portable fallback: the argument is evaluated several times, so pass a side-effect-free unsigned value of the matching width
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif
+
+// a > b -> b - a is negative; the difference is taken in 64 bits so it cannot overflow for any int pair
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(uint64_t)-1)); // sign bit smeared over the word, then truncated to 32 bits
+}
+
+// Branch-free a > b: when a > b the 64-bit difference b - a is negative, so its
+// sign bit smeared across the word yields all ones; otherwise the result is zero.
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t b_minus_a = (int64_t) b - (int64_t) a;
+    return (uint64_t) (b_minus_a >> 63);
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF; a ^ b is zero-extended first so a negative XOR still counts as "different"
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t)((-(int64_t)(unsigned int)(a ^ b)) >> (8*sizeof(uint64_t)-1)); // negated in 64 bits: no overflow, sign bit set iff a != b
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF; a ^ b is zero-extended first so a negative XOR still counts as "different"
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t)((-(int64_t)(unsigned int)(a ^ b)) >> (8*sizeof(uint64_t)-1)); // negative (sign bit set) iff a != b
+}
+
+// Branch-free byte comparison: 0x00 when a == b, 0xFF otherwise.
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) ((0 - (int32_t) (a ^ b)) >> 31);
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding both arguments unchanged to OQS_MEM_secure_free().
+ * 
+ * @param[out] mem Memory to be cleared and freed.
+ * @param size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding both arguments unchanged to OQS_MEM_cleanse().
+ * 
+ * @param[out] mem Memory to be cleared.
+ * @param size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/params.c b/src/sig/mayo/pqmayo_mayo-5_avx2/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC // the parameter-set objects exist only in dynamic-parameter builds
+static const unsigned char f_tail_64[] = F_TAIL_64; // targets of the MAYO_x_f_tail_arr macros
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  }
+
+MAYO_GEN_PARAMS(MAYO_1); // the macro no longer ends in ';', so this ';' terminates the declaration instead of adding a stray one
+MAYO_GEN_PARAMS(MAYO_2); // R_bytes is not listed in the initializer and is therefore zero-initialized
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h
new file mode 100644
index 0000000000..27b416adce
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_128.h
@@ -0,0 +1,524 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_128_H
+#define SHUFFLE_ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// XOR-accumulates the product P1*O into _acc, using nibble-indexed byte-shuffle lookups into O_multabs.
+// P1*O -> P1: v x v, O: v x o
+static 
+inline void mayo_5_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // P1 is read strictly sequentially and only for c >= r (two __m256i per (r, c) entry).
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P1, counted in __m256i units
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // one O_multabs vector serves the output pair (k, k+1)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0); 
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (acc is dereferenced directly: assumes _acc is suitably aligned for __m256i, TODO confirm)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[(2*r*O_MAX) + 2*k]     ^= temp[2*k]     ^ _mm256_slli_epi16(t0,4);
+            acc[(2*r*O_MAX) + 2*k + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[(2*r*O_MAX) + 2*k + 2] ^= temp[2*k + 1] ^ t0;
+            acc[(2*r*O_MAX) + 2*k + 3] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+// XOR-accumulates O^T * (input) into _acc: input is read as V_MAX rows x O_MAX columns, output is written as O_MAX x O_MAX.
+static 
+inline void mayo_5_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one input column c and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1O_P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // tables are indexed by input row r; one vector serves outputs k and k+1
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (output rows k and k+1, column c)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]  ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(k*O_MAX) + 2*c]      ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c]     ^= temp[2*k+1] ^ t0;
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k + 3] ^ t1;
+        }
+    }
+}
+
+// XOR-accumulates (P1 + P1^T) * O into _acc; the diagonal entries of P1 are skipped. Builds its own tables from O.
+static
+inline void mayo_5_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX]; // O_MAX/2 table vectors per row of O
+    mayo_O_multabs_avx2(O, O_multabs);
+
+    size_t cols_used = 0; // index of the next stored P1 entry (two __m256i each)
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[2*O_MAX] = {0};
+        cols_used += 1; // step over the diagonal entry (r, r)
+        size_t pos = r; // index of stored entry (0, r); walked down column r below
+        for (size_t c = 0; c < r; c++) // transposed part: uses stored entry (c, r) for c < r
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*pos);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*pos + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // advance to entry (c + 1, r): stored row c holds V_MAX - c entries
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++) // direct part: stored entries (r, c) for c > r, read sequentially
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        for (size_t k = 0; k < O_MAX; k+=2) // fold temp into acc; explicit loadu/storeu, so no alignment requirement on _acc here
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 1));
+            __m256i acc2 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 2));
+            __m256i acc3 = _mm256_loadu_si256(acc + (2*(r*O_MAX) + 2*k + 3));
+
+
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k),     acc0 ^ temp[2*k] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 2), acc2 ^ temp[2*k+1] ^ t0);
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 1), acc1 ^ temp[2*k+2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256(acc + (2*(r*O_MAX) + 2*k + 3), acc3 ^ temp[2*k+3] ^ t1);
+        }
+    }
+}
+
+// XOR-accumulates V^T * L into _acc: L is read as V_MAX rows x O_MAX columns, output is written as K_MAX rows x O_MAX columns.
+static 
+inline void mayo_5_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops because the odd-K_MAX tail below reuses its final value
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // do multiplications for one column c of L and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; presumably ceil(K_MAX / 2), confirm
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(L + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts table vectors (pairs of output rows)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (here k counts output rows, two per iteration)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c]     ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*O_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(k*O_MAX) + 2*c + 1]     ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*O_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last output row k = K_MAX - 1 has no partner row
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*k*O_MAX + 2*c + 1] ^= temp[2*k + 2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+// XOR-accumulates P1 * V^T into _acc: P1 is read sequentially for c >= r only, output is written as V_MAX rows x K_MAX columns.
+static 
+inline void mayo_5_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c;
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // index of the next stored P1 entry (two __m256i each)
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // do multiplications for one row and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; presumably ceil(K_MAX / 2), confirm
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts table vectors (pairs of output columns)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (here k counts output columns, two per iteration)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last output column k = K_MAX - 1 has no partner column
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+// XOR-accumulates V^T * Pv into _acc: Pv is read as V_MAX rows x K_MAX columns, output is written as K_MAX x K_MAX.
+static 
+inline void mayo_5_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc;
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops because the odd-K_MAX tail below reuses its final value
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // do multiplications for one column c of Pv and accumulate results in temporary format
+        __m256i temp[K_OVER_2*2*2] = {0}; // K_OVER_2 is defined outside this header; presumably ceil(K_MAX / 2), confirm
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(Pv + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++) // here k counts table vectors (pairs of output rows)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format and add to accumulator (here k counts output rows, two per iteration)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the last output row k = K_MAX - 1 has no partner row
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*k*K_MAX + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, stored row by row), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_5_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P1 = (__m256i *) _P1; // every P1/P2/acc entry occupies two consecutive __m256i
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t P1_cols_used = 0; // running index into P1: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate both products for row r in temporary format: four vectors per table (low/high nibbles of each input half)
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+
+        // P1 * S1: entries (r, c) for c = r .. V_MAX-1, looked up in the row-c tables of S1_multabs
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P1 + 2*P1_cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P1 + 2*P1_cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 * S2: entries (r, c) for c < O_MAX, looked up in the row-c tables of S2_multabs and added to the same temp
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256(P2 + 2*r*O_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[2k]/temp[2k+2] with the low nibbles of temp[2k+1]/temp[2k+3]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular, stored row by row; result is XORed into acc (o x k)
+static 
+inline void mayo_5_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P3 = (__m256i *) _P3; // every P3/acc entry occupies two consecutive __m256i
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P3: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: four vectors per table (low/high nibbles of each input half)
+        __m256i temp[K_OVER_2*2*2] = {0};
+
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(P3 + 2*cols_used);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(P3 + 2*cols_used + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[2k]/temp[2k+2] with the low nibbles of temp[2k+1]/temp[2k+3]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*(r*K_MAX) + 2*k + 2] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*(r*K_MAX) + 2*k + 3] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[2*(r*K_MAX) + 2*k] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(r*K_MAX) + 2*k + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+static // thin wrapper: forwards all three arguments unchanged to mayo_5_Vt_times_Pv_avx2, with S1_multabs used as the lookup tables
+inline void mayo_5_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_5_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static // For each column c < K_MAX: pushes PS2 entries (r, c), r < O_MAX, through the row-r tables in S2_multabs and XORs the k-th result into acc entry (k, c).
+inline void mayo_5_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2; // every PS2/acc entry occupies two consecutive __m256i
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops so the K_MAX-odd tail below can reuse its final value
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate the lookups for column c in temporary format: four vectors per table (low/high nibbles of each input half)
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c);
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd0 &= low_nibble_mask; // low nibble of every byte
+            __m256i in_odd1 = _mm256_loadu_si256(PS2 + 2*r*K_MAX + 2*c + 1);
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[2k]/temp[2k+2] with the low nibbles of temp[2k+1]/temp[2k+3]) and XOR into acc entries (k, c) and (k+1, c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+            acc[2*((k+1)*K_MAX) + 2*c] ^= temp[2*k+1] ^ t0;
+
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+            acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+            acc[2*((k+1)*K_MAX) + 2*c + 1] ^= temp[2*k+3] ^ t1;
+        }
+#if K_MAX % 2 == 1
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (k, c) exists
+        acc[2*(k*K_MAX) + 2*c] ^= temp[2*k] ^ _mm256_slli_epi16(t0,4);
+
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k+2],4)) & low_nibble_mask;
+        acc[2*(k*K_MAX) + 2*c + 1] ^= temp[2*k+2] ^ _mm256_slli_epi16(t1,4);
+#endif
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h
new file mode 100644
index 0000000000..defff86f8f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_64.h
@@ -0,0 +1,481 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_64_H
+#define SHUFFLE_ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+// P1*O -> P1: v x v (upper triangular, stored row by row), O: v x o
+static 
+inline void mayo_12_P1_times_O_avx2(const uint64_t *_P1, __m256i *O_multabs, uint64_t *_acc){
+    // For each row r < V_MAX: pushes P1 entries (r, c), c = r .. V_MAX-1, through the row-c tables in O_multabs and XORs result k into acc entry (r, k); one __m256i per entry.
+    const __m256i *P1 = (__m256i *) _P1;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P1: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2) // NOTE(review): touches temp[k + 1], so this presumably relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (r, k) and (r, k+1)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*O_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*O_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Ot_times_P1O_P2_avx2(const uint64_t *_P1O_P2, __m256i *O_multabs, uint64_t *_acc){
+    // For each column c < O_MAX: pushes P1O_P2 entries (r, c), r < V_MAX, through the row-r tables in O_multabs and XORs result k into acc entry (k, c); one __m256i per entry.
+    const __m256i *P1O_P2 = (__m256i *) _P1O_P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // accumulate the lookups for column c in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1O_P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (size_t k = 0; k < O_MAX; k+=2) // NOTE(review): touches temp[k + 1], so this presumably relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (k, c) and (k+1, c)
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c]     ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+    }
+}
+
+static
+inline void mayo_12_P1P1t_times_O(const uint64_t *_P1, const unsigned char *O, uint64_t *_acc){
+    // For each row r < V_MAX: combines the off-diagonal P1 entries (c, r), c < r, and (r, c), c > r, with the row-c tables built from O, and XORs result k into acc entry (r, k).
+    const __m256i *P1 = (__m256i *) _P1; // upper triangle stored row by row, one __m256i per entry
+    __m256i *acc = (__m256i *) _acc; // accessed only through unaligned load/store intrinsics
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs); // fills the lookup tables from O (helper defined elsewhere)
+
+    size_t cols_used = 0; // running index into the packed rows of P1
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[O_MAX] = {0};
+        cols_used += 1; // skip the diagonal entry (r, r)
+        size_t pos = r; // index of entry (0, r); walks down column r of the packed upper triangle
+        for (size_t c = 0; c < r; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + pos);
+            pos += (V_MAX -c - 1); // packed row c holds V_MAX - c entries, so entry (c+1, r) lies V_MAX - c - 1 further on
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (size_t k = 0; k < O_MAX; k+=2) // NOTE(review): touches temp[k + 1], so this presumably relies on O_MAX being even
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        for (size_t c = r+1; c < V_MAX; c++) // entries (r, c) right of the diagonal, read consecutively
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd);
+                temp[k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even);
+            }            
+        }
+
+        for (size_t k = 0; k < O_MAX; k+=2) // convert to normal format and XOR into acc entries (r, k) and (r, k+1)
+        {
+            __m256i acc0 = _mm256_loadu_si256(acc + (r*O_MAX + k    ));
+            __m256i acc1 = _mm256_loadu_si256(acc + (r*O_MAX + k + 1));
+
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // high nibbles of temp[k] XOR low nibbles of temp[k+1]; used to swap the two
+
+            _mm256_storeu_si256(acc + (r*O_MAX + k    ), acc0 ^ temp[k  ] ^ _mm256_slli_epi16(t,4));
+            _mm256_storeu_si256(acc + (r*O_MAX + k + 1), acc1 ^ temp[k+1] ^ t);
+        }
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_L_avx2(const uint64_t *_L, const __m256i *V_multabs, uint64_t *_acc){
+    // For each column c < O_MAX: pushes L entries (r, c), r < V_MAX, through the row-r tables in V_multabs and XORs result k < K_MAX into acc entry (k, c); one __m256i per entry.
+    const __m256i *L = (__m256i *) _L;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops so the K_MAX-odd tail below can reuse its final value
+
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // accumulate the lookups for column c in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(L + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (k, c) and (k+1, c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*O_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*O_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (k, c) exists
+        acc[k*O_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+static 
+inline void mayo_12_Vt_times_Pv_avx2(const uint64_t *_Pv, const __m256i *V_multabs, uint64_t *_acc){
+    // For each column c < K_MAX: pushes Pv entries (r, c), r < V_MAX, through the row-r tables in V_multabs and XORs result k < K_MAX into acc entry (k, c); one __m256i per entry.
+    const __m256i *Pv = (__m256i *) _Pv;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops so the K_MAX-odd tail below can reuse its final value
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate the lookups for column c in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(Pv + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (k, c) and (k+1, c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (k, c) exists
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+static // For each row r < V_MAX: pushes packed upper-triangular P1 entries (r, c), c >= r, through the row-c tables in V_multabs and XORs result k < K_MAX into acc entry (r, k).
+inline void mayo_12_P1_times_Vt_avx2(const uint64_t *_P1, __m256i *V_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P1 = (__m256i *) _P1; // one __m256i per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P1: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P1*S1 -> P1: v x v, S1: v x k // P1 upper triangular
+// same computation as mayo_12_P1_times_Vt_avx2: forwards all arguments unchanged, with S1_multabs taking the place of V_multabs
+static
+inline void mayo_12_P1_times_S1_avx2(const uint64_t *_P1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_12_P1_times_Vt_avx2(_P1, S1_multabs, _acc);
+}
+
+static // thin wrapper: forwards all three arguments unchanged to mayo_12_Vt_times_Pv_avx2, with S1_multabs used as the lookup tables
+inline void mayo_12_S1t_times_PS1_avx2(const uint64_t *_PS1, __m256i *S1_multabs, uint64_t *_acc){
+    mayo_12_Vt_times_Pv_avx2(_PS1, S1_multabs, _acc);
+}
+
+static // For each column c < K_MAX: pushes PS2 entries (r, c), r < O_MAX, through the row-r tables in S2_multabs and XORs result k < K_MAX into acc entry (k, c).
+inline void mayo_12_S2t_times_PS2_avx2(const uint64_t *_PS2, __m256i *S2_multabs, uint64_t *_acc){
+    const __m256i *PS2 = (__m256i *) _PS2; // one __m256i per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    size_t k; // declared outside the loops so the K_MAX-odd tail below can reuse its final value
+
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // accumulate the lookups for column c in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(PS2 + r*K_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (k, c) and (k+1, c)
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(k*K_MAX) + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[((k+1)*K_MAX) + c] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (k, c) exists
+        acc[k*K_MAX + c] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P2*S2 -> P2: v x o, S2: o x k; the result is XORed into acc (v x k), one __m256i per entry
+static 
+inline void mayo_12_P2_times_S2_avx2(const uint64_t *_P2, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P2; every row consumes O_MAX entries, so it equals r*O_MAX + c
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, stored row by row), S1: v x k, P2: v x o, S2: o x k
+static 
+inline void mayo_12_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *_P1, const uint64_t *_P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P1 = (__m256i *) _P1; // one __m256i per P1/P2/acc entry
+    const __m256i *P2 = (__m256i *) _P2;
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t P1_cols_used = 0; // running index into P1: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // accumulate both products for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+
+
+        // P1 * S1: entries (r, c) for c = r .. V_MAX-1, looked up in the row-c tables of S1_multabs
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P1 + P1_cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            P1_cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // P2 * S2: entries (r, c) for c < O_MAX, looked up in the row-c tables of S2_multabs and added to the same temp
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P2 + r*O_MAX + c);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask;
+            in_odd &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o, S2: o x k // P3 upper triangular, stored row by row; result is XORed into acc (o x k)
+static 
+inline void mayo_12_P3_times_S2_avx2(const uint64_t *_P3, __m256i *S2_multabs, uint64_t *_acc){
+    size_t k,c; // k is reused by the K_MAX-odd tail after the conversion loop
+    const __m256i *P3 = (__m256i *) _P3; // one __m256i per entry
+    __m256i *acc = (__m256i *) _acc; // NOTE(review): updated through plain __m256i dereferences - presumably needs a 32-byte-aligned buffer, confirm with callers
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+
+    size_t cols_used = 0; // running index into P3: only entries with c >= r are read, consecutively
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // accumulate the lookups for row r in temporary format: two vectors per table (low nibbles, high nibbles)
+        __m256i temp[K_OVER_2*2] = {0};
+
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd = _mm256_loadu_si256(P3 + cols_used);
+            __m256i in_even = _mm256_srli_epi16(in_odd, 4) & low_nibble_mask; // high nibble of every byte
+            in_odd &= low_nibble_mask; // low nibble of every byte
+            cols_used ++;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even);
+            }            
+        }
+
+        // convert to normal format (swap the high nibbles of temp[k] with the low nibbles of temp[k+1]) and XOR into acc entries (r, k) and (r, k+1)
+        for (k = 0; k + 1 < K_MAX; k+=2)
+        {
+            __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask;
+            acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+            acc[(r*K_MAX) + k + 1] ^= temp[k+1] ^ t;
+        }
+#if K_MAX % 2 == 1
+        __m256i t = (temp[k + 1] ^ _mm256_srli_epi16(temp[k],4)) & low_nibble_mask; // K_MAX odd: k == K_MAX - 1 here, only entry (r, k) exists
+        acc[(r*K_MAX) + k] ^= temp[k] ^ _mm256_slli_epi16(t,4);
+#endif
+    }
+}
+
+
+static inline
+void mayo12_m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+    // Packs the folded upper triangle of the leading O_MAX x O_MAX part of `in` (row stride `size` entries) into `out`: per row r, entry (r, r), then (r, c) ^ (c, r) for each c > r.
+    int m_vecs_stored = 0; // number of entries written to out so far
+
+    for (int r = 0; r < O_MAX; ++r) {
+        const __m256i* _in = (const __m256i*) (in + m_legs * 2 * (r * size + r)); // entries lie m_legs * 2 uint64_t apart; NOTE(review): only one __m256i is moved per entry, so presumably m_legs == 2 - confirm
+        __m256i* _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored); // NOTE(review): plain __m256i dereferences - presumably in/out are 32-byte aligned, confirm with callers
+        _out[0] = _in[0]; // diagonal entry is copied as is
+        m_vecs_stored++;
+        for (int c = r + 1; c < O_MAX; ++c) {
+            const __m256i* _in2 = (const __m256i*) (in + m_legs * 2 * (r * size + c)); // entry (r, c)
+            const __m256i* _in3 = (const __m256i*) (in + m_legs * 2 * (c * size + r)); // mirrored entry (c, r)
+            _out = (__m256i*) (out + m_legs * 2 * m_vecs_stored);
+            _out[0] = _in2[0] ^ _in3[0];
+            m_vecs_stored++;
+        }
+    }
+}
+
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h
new file mode 100644
index 0000000000..9b3a69d567
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/shuffle_arithmetic_96.h
@@ -0,0 +1,550 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SHUFFLE_ARITHMETIC_96_H
+#define SHUFFLE_ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <immintrin.h>
+#include <arithmetic_common.h>
+
+
+// P1*O -> P1: v x v (upper triangular, packed row by row), O: v x o; the product is XORed into acc (v x o)
+static 
+inline void mayo_3_P1_times_O_avx2(const uint64_t *P1, __m256i *O_multabs, uint64_t *acc){
+    // Each matrix entry is a vector of 96 nibbles held in 6 uint64_t limbs (48 bytes).
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // cols_used counts the P1 entries consumed so far; row r only supplies columns r..V_MAX-1.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // byte-shuffle table lookups for row r; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+            // O_multabs entry k/2 of row c serves output columns k and k+1 (temp[2k..2k+3])
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Ot_times_P1O_P2_avx2(const uint64_t *P1O_P2, __m256i *O_multabs, uint64_t *acc){
+    // acc[k][c] ^= sum over r < V_MAX of (scalar (r, k), presumably O[r][k], from O_multabs) * P1O_P2[r][c], for k, c < O_MAX.
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Each matrix entry is a vector of 96 nibbles held in 6 uint64_t limbs (48 bytes).
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // byte-shuffle table lookups for column c; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[2*O_MAX] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1O_P2 + (r*O_MAX + c) * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // O_multabs entry k/2 of row r serves output rows k and k+1 (temp[2k..2k+3])
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*r + k/2], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static
+inline void mayo_3_P1P1t_times_O(const uint64_t *P1, const unsigned char *O, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc[r][k] ^= sum over c != r of (P1[r][c] for c > r, mirrored P1[c][r] for c < r) * (scalar (c, k) from O); the diagonal is skipped.
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    // P1 is packed upper triangular; each entry is 96 nibbles in 6 uint64_t limbs. cols_used follows storage order.
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        cols_used ++; // step over the diagonal entry (r, r)
+        // byte-shuffle table lookups for row r; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[2*O_MAX] = {0};
+        size_t pos = r; // packed index of entry (0, r)
+        for (size_t c = 0; c < r; c++) // columns left of the diagonal: use the stored mirror entry (c, r)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + pos * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            pos += (V_MAX -c - 1); // move to entry (c + 1, r): packed row c holds V_MAX - c entries, minus one for the later row start
+            // O_multabs entry k/2 of row c serves output columns k and k+1 (temp[2k..2k+3])
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+        // columns right of the diagonal: entries (r, c) are read directly in storage order
+        for (size_t c = r+1; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used ++;
+
+            for (size_t k = 0; k < O_MAX; k+=2)
+            {
+                temp[2*k]     ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd0);
+                temp[2*k + 1] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even0);
+                temp[2*k + 2] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_odd1);
+                temp[2*k + 3] ^= _mm256_shuffle_epi8(O_multabs[O_MAX/2*c + k/2], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (size_t k = 0; k < O_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*O_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_L_avx2(const uint64_t *L, const __m256i *V_multabs, uint64_t *acc){
+    // acc[k][c] ^= sum over r < V_MAX of (scalar (r, k) from V_multabs) * L[r][c], for k < K_MAX, c < O_MAX.
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Each matrix entry is a vector of 96 nibbles held in 6 uint64_t limbs (48 bytes).
+    size_t k; // declared outside the loops so the odd-K_MAX tail can reuse its final value
+    for (size_t c = 0; c < O_MAX; c++)
+    {
+        // byte-shuffle table lookups for column c; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(L + (r*O_MAX + c) * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // table k of row r serves output rows 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*O_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output row; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*O_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_P1_times_Vt_avx2(const uint64_t *P1, __m256i *V_multabs, uint64_t *acc){
+      const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc[r][k] ^= sum over c >= r of P1[r][c] * (scalar (c, k) from V_multabs); P1 is packed upper triangular, k < K_MAX.
+    size_t k,c; // k is declared outside the loops so the odd-K_MAX tail can reuse its final value
+    size_t cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // byte-shuffle table lookups for row r; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c = r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6)); // bytes 0..31 of the 48-byte entry: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + cols_used * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // table k of row c serves output columns 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output column; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static 
+inline void mayo_3_Vt_times_Pv_avx2(const uint64_t *Pv, const __m256i *V_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc[k][c] ^= sum over r < V_MAX of (scalar (r, k) from V_multabs) * Pv[r][c], for k, c < K_MAX; entries are 6-limb (96-nibble) vectors.
+    size_t k; // declared outside the loops so the odd-K_MAX tail can reuse its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // byte-shuffle table lookups for column c; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < V_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(Pv + (r*K_MAX + c) * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // table k of row r serves output rows 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(V_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output row; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P1*S1 + P2*S2 -> P1: v x v (upper triangular, packed), S1: v x k, P2: v x o, S2: o x k; the sum is XORed into acc (v x k)
+static 
+inline void mayo_3_P1_times_S1_plus_P2_times_S2_avx2(const uint64_t *P1, const uint64_t *P2, __m256i *S1_multabs, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Each matrix entry is a vector of 96 nibbles held in 6 uint64_t limbs (48 bytes).
+    size_t k,c; // k is declared outside the loops so the odd-K_MAX tail can reuse its final value
+    size_t P1_cols_used = 0;
+    for (size_t r = 0; r < V_MAX; r++)
+    {
+        // byte-shuffle table lookups for row r; both products share temp, with low-nibble (in_odd*) and high-nibble (in_even*) results in separate slots
+        // P1 times S1
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < V_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P1 + P1_cols_used * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            P1_cols_used++;
+            // table k of row c serves output columns 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S1_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // P2 times S2 (P2 is a full v x o matrix, stored row-major)
+        for (c=0; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P2 + (r*O_MAX + c) * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output column; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+// P3*S2 -> P3: o x o (upper triangular, packed row by row), S2: o x k; the product is XORed into acc (o x k)
+static 
+inline void mayo_3_P3_times_S2_avx2(const uint64_t *P3, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // Each matrix entry is a vector of 96 nibbles held in 6 uint64_t limbs; cols_used counts P3 entries consumed in storage order.
+    size_t k,c; // k is declared outside the loops so the odd-K_MAX tail can reuse its final value
+    size_t cols_used = 0;
+    for (size_t r = 0; r < O_MAX; r++)
+    {
+        // byte-shuffle table lookups for row r; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (c=r; c < O_MAX; c++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(P3 + cols_used * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            cols_used++;
+            // table k of row c serves output columns 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*c + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k + 1)* 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output column; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (r*K_MAX + k)* 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+static
+inline void mayo_3_S1t_times_PS1_avx2(const uint64_t *PS1, __m256i *S1_multabs, uint64_t *acc){
+    mayo_3_Vt_times_Pv_avx2(PS1, S1_multabs, acc); // pure forwarder: the Vt*Pv kernel (V_MAX table rows, K_MAX x K_MAX output) is reused with S1's tables
+}
+
+static
+inline void mayo_3_S2t_times_PS2_avx2(const uint64_t *PS2, __m256i *S2_multabs, uint64_t *acc){
+    const __m256i low_nibble_mask  = _mm256_set_epi64x(0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f);
+    // acc[k][c] ^= sum over r < O_MAX of (scalar (r, k) from S2_multabs) * PS2[r][c], for k, c < K_MAX; entries are 6-limb (96-nibble) vectors.
+    size_t k; // declared outside the loops so the odd-K_MAX tail can reuse its final value
+    for (size_t c = 0; c < K_MAX; c++)
+    {
+        // byte-shuffle table lookups for column c; results for low-nibble (in_odd*) and high-nibble (in_even*) inputs land in separate temp slots
+        __m256i temp[K_OVER_2*2*2] = {0};
+        for (size_t r = 0; r < O_MAX; r++)
+        {
+            __m256i in_odd0 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6)); // bytes 0..31: nibbles #0..#63
+            __m256i in_even0 = _mm256_srli_epi16(in_odd0, 4) & low_nibble_mask;
+            in_odd0 &= low_nibble_mask;
+            __m256i in_odd1 = _mm256_loadu_si256((__m256i *)(PS2 + (r*K_MAX + c) * 6 + 2)); // bytes 16..47: nibbles #32..#95 (overlaps the first window)
+            __m256i in_even1 = _mm256_srli_epi16(in_odd1, 4) & low_nibble_mask;
+            in_odd1 &= low_nibble_mask;
+            // table k of row r serves output rows 2k and 2k+1 (temp[4k..4k+3])
+            for (k = 0; k < K_OVER_2; k++)
+            {
+                temp[4*k]     ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd0);
+                temp[4*k + 1] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even0);
+                temp[4*k + 2] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_odd1);
+                temp[4*k + 3] ^= _mm256_shuffle_epi8(S2_multabs[K_OVER_2*r + k], in_even1);
+            }            
+        }
+
+        // swap nibbles within each temp pair to restore the packed two-per-byte layout, then XOR into acc
+        for (k = 0; k+1 < K_MAX; k+=2)
+        {
+            __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+            __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+            __m256i acc2 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6));
+            __m256i acc3 = _mm256_loadu_si256((__m256i *)(acc + ((k+1)*K_MAX + c)* 6 + 2));
+            // t0 = lo(temp[2k+1]) ^ hi(temp[2k]); XORing it into both vectors swaps those two nibbles (t1 likewise for the second window)
+            __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+            __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+            // stores come after all loads: the two 32-byte windows of one 48-byte entry overlap in bytes 16..31
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+            _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6),     acc2 ^ temp[2*k + 1] ^ t0);
+            _mm256_storeu_si256((__m256i *)(acc + ((k+1)*K_MAX + c) * 6 + 2), acc3 ^ temp[2*k + 3] ^ t1);
+        }
+#if K_MAX % 2 == 1 // odd K_MAX: the pair loop leaves one output row; k == K_MAX - 1 here
+        __m256i acc0 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6));
+        __m256i acc1 = _mm256_loadu_si256((__m256i *)(acc + (k*K_MAX + c)* 6 + 2));
+
+        __m256i t0 = (temp[2*k + 1] ^ _mm256_srli_epi16(temp[2*k    ],4)) & low_nibble_mask;
+        __m256i t1 = (temp[2*k + 3] ^ _mm256_srli_epi16(temp[2*k + 2],4)) & low_nibble_mask;
+
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6),         acc0 ^ temp[2*k    ] ^ _mm256_slli_epi16(t0,4));
+        _mm256_storeu_si256((__m256i *)(acc + (k*K_MAX + c) * 6 + 2),     acc1 ^ temp[2*k + 2] ^ _mm256_slli_epi16(t1,4));
+#endif
+    }
+}
+
+#undef K_OVER_2
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_avx2/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// GF(16) multiplication of the low nibbles of a and b, reduced mod x^4 + x + 1.
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // Carry-less schoolbook product: for every bit of a's low nibble,
+    // XOR in the correspondingly shifted copy of b.
+    unsigned char prod = 0;
+    for (int bit = 1; bit <= 8; bit <<= 1) {
+        prod ^= (a & bit) * b;
+    }
+
+    // The high nibble holds the terms of degree >= 4; x^4 = x + 1 folds each
+    // of them onto two lower positions (shift by 4 and by 3).
+    unsigned char spill = prod & 0xf0;
+    return (prod ^ (spill >> 4) ^ (spill >> 3)) & 0x0f;
+}
+
+// Byte-sliced GF(16) scalar multiplication: each byte of b carries one field
+// element in its low nibble and is multiplied by the low nibble of a.
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // carry-less product, computed for all eight bytes at once
+    uint64_t prod = 0;
+    for (int bit = 1; bit <= 8; bit <<= 1) {
+        prod ^= (a & bit) * b;
+    }
+
+    // reduce every byte mod x^4 + x + 1
+    uint64_t spill = prod & 0xf0f0f0f0f0f0f0f0;
+    return (prod ^ (spill >> 4) ^ (spill >> 3)) & 0x0f0f0f0f0f0f0f0f;
+}
+
+// GF(16) addition: the field has characteristic 2, so this is a bitwise XOR
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) subtraction: identical to addition (XOR) in characteristic 2
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return a ^ b;
+}
+
+// GF(16) negation: every element is its own additive inverse, so a is returned unchanged
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+    // GF(16) inverse computed as a^14 (a^15 = 1 for nonzero a); 0 maps to 0. Table form noted by the
+    // authors as an alternative: {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5, 10, 4, 3, 8}[a & 15]
+
+    unsigned char a2 = mul_f(a, a);
+    unsigned char a4 = mul_f(a2, a2);
+    unsigned char a8 = mul_f(a4, a4);
+    unsigned char a6 = mul_f(a2, a4);
+    unsigned char a14 = mul_f(a8, a6);
+
+    return a14;
+}
+
+// GF(16) dot product of a[0..n-1] with every m-th entry of b: b[0], b[m], ..., b[(n-1)*m].
+static inline unsigned char lincomb(const unsigned char *a, const unsigned char *b, int n, int m) {
+    unsigned char sum = 0;
+    for (int idx = 0; idx < n; idx++) {
+        sum ^= mul_f(a[idx], b[idx * m]);
+    }
+    return sum;
+}
+
+// c (row_a x col_b) = a (row_a x colrow_ab) * b (colrow_ab x col_b); row-major, one GF(16) element per byte.
+static inline void mat_mul(const unsigned char *a, const unsigned char *b, unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int r = 0; r < row_a; r++) {
+        for (int j = 0; j < col_b; j++) {
+            c[r * col_b + j] = lincomb(a + r * colrow_ab, b + j, colrow_ab, col_b);
+        }
+    }
+}
+
+// c = a + b over GF(16) for m x n row-major matrices (element-wise add_f, i.e. XOR).
+static inline void mat_add(const unsigned char *a, const unsigned char *b, unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; row++) {
+        for (int col = 0; col < n; col++) {
+            c[row * n + col] = add_f(a[row * n + col], b[row * n + col]);
+        }
+    }
+}
+
+// Copy 2 * m_legs 64-bit words from in to out (nothing is written when m_legs <= 0).
+static inline void m_vec_copy(int m_legs, const uint64_t *in, uint64_t *out) {
+    const int words = m_legs * 2;
+    for (int w = 0; w < words; w++) {
+        out[w] = in[w];
+    }
+}
+
+// XOR 2 * m_legs 64-bit words of in into acc (GF(16) vector addition on packed nibbles).
+static inline void m_vec_add(int m_legs, const uint64_t *in, uint64_t *acc) {
+    const int words = m_legs * 2;
+    for (int w = 0; w < words; w++) {
+        acc[w] ^= in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+// gf16v_mul_u32: multiply each of the eight nibbles packed in a by the GF(16) element in the low 4 bits of b.
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    uint32_t a_msb;
+    uint32_t a32 = a;
+    uint32_t b32 = b;
+    uint32_t r32 = a32 * (b32 & 1);  // bit 0 of b selects a * 1
+
+    a_msb = a32 & 0x88888888; // top bit (x^3 term) of every nibble
+    a32 ^= a_msb;   // clear it so the shift cannot spill into the neighbouring nibble
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);  // a *= x; a former x^3 term becomes x^4 = x + 1 (0b0011)
+    r32 ^= (a32) * ((b32 >> 1) & 1);  // bit 1 of b selects a * x
+
+    a_msb = a32 & 0x88888888; // same step again: a * x -> a * x^2
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+    r32 ^= (a32) * ((b32 >> 2) & 1);  // bit 2 of b selects a * x^2
+
+    a_msb = a32 & 0x88888888; // and once more: a * x^2 -> a * x^3
+    a32 ^= a_msb;   // clear MSB
+    a32 = (a32 << 1) ^ ((a_msb >> 3) * 3);
+    r32 ^= (a32) * ((b32 >> 3) & 1);  // bit 3 of b selects a * x^3
+
+    return r32;
+
+}
+// acc ^= a * in over 2 * m_legs words; gf16v_mul_u64 is not defined in this header (presumably the 64-bit analogue of gf16v_mul_u32 - confirm).
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) {
+        acc[w] ^= gf16v_mul_u64(in[w], a);
+    }
+}
+// acc ^= x * in over 2 * m_legs words, where x is the field element 0x2 (same helper as m_vec_mul_add with a fixed scalar).
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) {
+    for (int w = 0; w < 2 * m_legs; ++w) {
+        acc[w] ^= gf16v_mul_u64(in[w], 0x2);
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/LICENSE b/src/sig/mayo/pqmayo_mayo-5_opt/LICENSE
new file mode 100644
index 0000000000..8f71f43fee
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/LICENSE
@@ -0,0 +1,202 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "{}"
+      replaced with your own identifying information. (Don't include
+      the brackets!)  The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright {yyyy} {name of copyright owner}
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/NOTICE b/src/sig/mayo/pqmayo_mayo-5_opt/NOTICE
new file mode 100644
index 0000000000..53da47c21c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/NOTICE
@@ -0,0 +1,13 @@
+Copyright 2023 the MAYO team. All rights reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
\ No newline at end of file
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/aes_ctr.h b/src/sig/mayo/pqmayo_mayo-5_opt/aes_ctr.h
new file mode 100644
index 0000000000..8e41c30f2c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/aes_ctr.h
@@ -0,0 +1,30 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef AESCTR_H
+#define AESCTR_H
+
+#include <stddef.h>
+#include <stdint.h>
+
+void AES_256_ECB(const uint8_t *input, const uint8_t *key, uint8_t *output);
+#define AES_ECB_encrypt AES_256_ECB
+
+#ifdef ENABLE_AESNI
+int AES_128_CTR_NI(unsigned char *output, size_t outputByteLen,
+                   const unsigned char *input, size_t inputByteLen);
+int AES_128_CTR_4R_NI(unsigned char *output, size_t outputByteLen,
+                      const unsigned char *input, size_t inputByteLen);
+#define AES_128_CTR AES_128_CTR_NI
+#else
+#include <aes.h>
+static inline int AES_128_CTR(unsigned char *output, size_t outputByteLen,
+                const unsigned char *input, size_t inputByteLen) {
+    uint8_t zero_iv[12] = { 0 };  // fixed all-zero 12-byte IV
+    (void) inputByteLen;          // the length of input is not consulted on this path
+    aes128ctr_prf(output, outputByteLen, input, zero_iv);  // input is presumably the AES-128 key - confirm against <aes.h>
+    return (int) outputByteLen;   // reports the number of bytes requested
+}
+#endif
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/api.c b/src/sig/mayo/pqmayo_mayo-5_opt/api.c
new file mode 100644
index 0000000000..f2e861e9c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/api.c
@@ -0,0 +1,46 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <api.h>
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define MAYO_PARAMS &MAYO_5
+#else
+#define MAYO_PARAMS 0
+#endif
+
+// Key generation entry point: forwards to mayo_keypair() with MAYO_PARAMS and returns its status.
+int crypto_sign_keypair(unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair(MAYO_PARAMS, pk, sk);
+}
+
+// Signing entry point: forwards to mayo_sign() with MAYO_PARAMS and returns its status unchanged.
+int crypto_sign(unsigned char *sm, size_t *smlen,
+                const unsigned char *m, size_t mlen,
+                const unsigned char *sk) {
+    return mayo_sign(MAYO_PARAMS, sm, smlen, m, mlen, sk);
+}
+
+// Detached-signature entry point: forwards to mayo_sign_signature() and returns its status unchanged.
+int crypto_sign_signature(unsigned char *sig, size_t *siglen,
+                          const unsigned char *m, size_t mlen,
+                          const unsigned char *sk) {
+    return mayo_sign_signature(MAYO_PARAMS, sig, siglen, m, mlen, sk);
+}
+
+// Signed-message opening entry point: forwards to mayo_open() and returns its status unchanged.
+int crypto_sign_open(unsigned char *m, size_t *mlen,
+                     const unsigned char *sm, size_t smlen,
+                     const unsigned char *pk) {
+    return mayo_open(MAYO_PARAMS, m, mlen, sm, smlen, pk);
+}
+
+// Detached-signature verification. A siglen other than CRYPTO_BYTES is rejected
+// with -1 before mayo_verify() is called; otherwise its result is returned.
+int crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                       const unsigned char *m, size_t mlen,
+                       const unsigned char *pk) {
+    if (siglen != CRYPTO_BYTES) { return -1; }
+    return mayo_verify(MAYO_PARAMS, m, mlen, sig, pk);
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/api.h b/src/sig/mayo/pqmayo_mayo-5_opt/api.h
new file mode 100644
index 0000000000..404d185c08
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/api.h
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef api_h
+#define api_h
+
+#include <mayo.h>
+
+#define CRYPTO_SECRETKEYBYTES 40
+#define CRYPTO_PUBLICKEYBYTES 5008
+#define CRYPTO_BYTES 838
+
+#define CRYPTO_ALGNAME "MAYO-5"
+
+#define crypto_sign_keypair MAYO_NAMESPACE(crypto_sign_keypair)
+int
+crypto_sign_keypair(unsigned char *pk, unsigned char *sk);
+
+#define crypto_sign MAYO_NAMESPACE(crypto_sign)
+int
+crypto_sign(unsigned char *sm, size_t *smlen,
+            const unsigned char *m, size_t mlen,
+            const unsigned char *sk);
+
+#define crypto_sign_signature MAYO_NAMESPACE(crypto_sign_signature)
+int
+crypto_sign_signature(unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+#define crypto_sign_open MAYO_NAMESPACE(crypto_sign_open)
+int
+crypto_sign_open(unsigned char *m, size_t *mlen,
+                 const unsigned char *sm, size_t smlen,
+                 const unsigned char *pk);
+
+#define crypto_sign_verify MAYO_NAMESPACE(crypto_sign_verify)
+int
+crypto_sign_verify(const unsigned char *sig, size_t siglen,
+                   const unsigned char *m, size_t mlen,
+                   const unsigned char *pk);
+
+#endif /* api_h */
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c
new file mode 100644
index 0000000000..de09954c8a
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.c
@@ -0,0 +1,327 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <arithmetic_common.h>
+#include <mem.h>
+#include <echelon_form.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+// Fold a size x size matrix of m-vectors into packed upper-triangular form: entry (r, c), r <= c, becomes in[r][c] (+ in[c][r] when r != c).
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size) {
+#if defined(MAYO_VARIANT) && MAYO_AVX && (M_MAX == 64)
+    mayo12_m_upper(m_legs, in, out, size);
+#else
+    uint64_t *dst = out;  // next free m-vector slot in out
+    for (int r = 0; r < size; r++) {
+        for (int c = r; c < size; c++, dst += m_legs * 2) {
+            m_vec_copy(m_legs, in + m_legs * 2 * (r * size + c), dst);
+            if (r != c) {
+                m_vec_add(m_legs, in + m_legs * 2 * (c * size + r), dst);
+            }
+        }
+    }
+#endif
+}
+
+// Accumulate (P1 + P1^T) * O into acc. AVX2 builds with a fixed MAYO_VARIANT dispatch to the
+// kernel for the matching M_MAX; otherwise the packed upper triangle of P1 is walked entry by entry.
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* acc){
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    (void) p;
+    mayo_12_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    (void) p;
+    mayo_3_P1P1t_times_O(P1, O, acc);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    (void) p;
+    mayo_5_P1P1t_times_O(P1, O, acc);
+#else
+    #ifndef MAYO_VARIANT
+    const int m_legs = PARAM_m(p) / 32;
+    #else
+    (void) p;
+    #endif
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_n(p) - PARAM_o(p);
+
+    // entry is the row-major index of (r, c), r <= c, inside the packed upper triangle of P1;
+    // it advances for every pair, including the diagonal ones that are skipped.
+    int entry = 0;
+    for (int r = 0; r < param_v; r++) {
+        for (int c = r; c < param_v; c++, entry++) {
+            if (c == r) {
+                continue;
+            }
+            // off-diagonal entry: feeds row r of acc using row c of O, and row c of acc using row r of O
+            for (int k = 0; k < param_o; k++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(P1 + 4 * entry, O[c * param_o + k], acc + 4 * (r * param_o + k));
+                vec_mul_add_64(P1 + 4 * entry, O[r * param_o + k], acc + 4 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(P1 + 6 * entry, O[c * param_o + k], acc + 6 * (r * param_o + k));
+                vec_mul_add_96(P1 + 6 * entry, O[r * param_o + k], acc + 6 * (c * param_o + k));
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(P1 + 8 * entry, O[c * param_o + k], acc + 8 * (r * param_o + k));
+                vec_mul_add_128(P1 + 8 * entry, O[r * param_o + k], acc + 8 * (c * param_o + k));
+#else
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[c * param_o + k], acc + m_legs * 2 * (r * param_o + k));
+                m_vec_mul_add(m_legs, P1 + m_legs * 2 * entry, O[r * param_o + k], acc + m_legs * 2 * (c * param_o + k));
+#endif
+            }
+        }
+    }
+#endif
+}
+
+// Sign-time products M = V * L and Y = V * P1 * V^T.
+// AVX2 builds with a fixed MAYO_VARIANT dispatch to the mayo_*_avx2 kernels
+// for the matching M_MAX; every other build uses the generic mul_add helpers.
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_12_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_12_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_12_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_3_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_3_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_3_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i V_multabs[(K_MAX+1)/2*V_MAX];
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+    mayo_V_multabs_avx2(V, V_multabs);
+    mayo_5_Vt_times_L_avx2(L, V_multabs, M);
+    mayo_5_P1_times_Vt_avx2(P1, V_multabs, Pv);
+    mayo_5_Vt_times_Pv_avx2(Pv, V_multabs, Y);
+#else
+    const int m_legs  = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_v = PARAM_n(p) - param_o;
+
+    // zero-initialised scratch for the intermediate product P1 * V^T
+    alignas (32) uint64_t Pv[N_MINUS_O_MAX * K_MAX * M_MAX / 16] = {0};
+
+    // V * L -> M
+    mul_add_mat_x_m_mat(m_legs, V, L, M, param_k, param_v, param_o);
+    // P1 * V^T -> Pv (P1 is handled as upper triangular)
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, V, Pv, param_v, param_v, param_k, 1);
+    // V * Pv -> Y
+    mul_add_mat_x_m_mat(m_legs, V, Pv, Y, param_k, param_v, param_k);
+#endif
+}
+
+// KeyGen product P3 = O^T * (P1*O + P2). P1O_P2 is updated in place by the first step
+// (presumably it holds P2 on entry - confirm with the caller) and then multiplied by O^T into P3.
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3) {
+    (void) p;
+#if MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 64
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_12_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_12_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 96
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_3_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_3_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#elif MAYO_AVX && defined(MAYO_VARIANT) && M_MAX == 128
+    __m256i O_multabs[O_MAX/2*V_MAX];
+    mayo_O_multabs_avx2(O, O_multabs);
+    mayo_5_P1_times_O_avx2(P1, O_multabs, P1O_P2);
+    mayo_5_Ot_times_P1O_P2_avx2(P1O_P2, O_multabs, P3);
+#else
+    const int m_legs  = PARAM_m(p) / 32;
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int p1_dim  = PARAM_n(p) - param_o;
+    // P1 * O is added into P1O_P2 (P1 is handled as upper triangular)
+    mul_add_m_upper_triangular_mat_x_mat(m_legs, P1, O, P1O_P2, p1_dim, p1_dim, param_o, 1);
+    // O^T * P1O_P2 -> P3
+    mul_add_mat_trans_x_m_mat(m_legs, O, P1O_P2, P3, param_v, param_o, param_o);
+#endif
+}
+
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+// compute S * PS  = [ S1 S2 ] * [ P1*S1 + P2*S2 = P1 ] = [ S1*P1 + S2*P2 ]
+//                               [         P3*S2 = P2 ]
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS) {
+    (void) m;  // m is only read by the generic (non-AVX) path at the bottom
+#if MAYO_AVX
+    const int n = o + v;
+    // Split every row of S (k rows, n = v + o entries each) into S1 (first v entries) and S2 (last o entries).
+    /* Old approach which is constant time but doesn't have to be */
+    unsigned char S1[V_MAX*K_MAX] = { 0 }; // == N-O, K
+    unsigned char S2[O_MAX*K_MAX] = { 0 }; // == O, K
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+
+#if M_MAX == 64
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_12_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_12_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_12_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_12_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 96
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_3_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_3_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_3_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_3_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#elif M_MAX == 128
+    __m256i S1_multabs[(K_MAX+1)/2*V_MAX];
+    __m256i S2_multabs[(K_MAX+1)/2*O_MAX];
+    mayo_S1_multabs_avx2(S1, S1_multabs);
+    mayo_S2_multabs_avx2(S2, S2_multabs);
+
+    mayo_5_P1_times_S1_plus_P2_times_S2_avx2(P1, P2, S1_multabs, S2_multabs, PS);
+    mayo_5_P3_times_S2_avx2(P3, S2_multabs, PS + V_MAX*K_MAX*M_MAX/16); // upper triangular
+
+    // S^T * PS = S1^t*PS1 + S2^t*PS2
+    mayo_5_S1t_times_PS1_avx2(PS, S1_multabs, SPS);
+    mayo_5_S2t_times_PS2_avx2(PS + V_MAX*K_MAX*M_MAX/16, S2_multabs, SPS);
+#else
+#error "m_calculate_PS_SPS: no AVX2 kernels for this M_MAX (expected 64, 96 or 128)"
+#endif
+#else
+    const int n = o + v;
+    alignas (32) uint64_t PS[N_MAX * K_MAX * M_MAX / 16] = { 0 };
+    mayo_generic_m_calculate_PS(P1, P2, P3, S, m, v, o, k, PS);
+    mayo_generic_m_calculate_SPS(PS, S, m, k, n, SPS);
+#endif
+}
+
+
+// sample a solution x to Ax = y, with r used as randomness
+// require:
+// - A is a matrix with m rows and k*o+1 columns (values in the last column are
+// not important, they will be overwritten by y) in row major order
+// - y is a vector with m elements
+// - r and x are k*o bytes long (NOTE(review): r[k*o] is read and x[k*o] can be written below - confirm callers over-allocate)
+// return: 1 on success, 0 if the reduced system is not full rank (caller may retry)
+int sample_solution(const mayo_params_t *p, unsigned char *A,
+                           const unsigned char *y, const unsigned char *r,
+                           unsigned char *x, int k, int o, int m, int A_cols) {
+    #ifdef MAYO_VARIANT
+    (void) p;
+    #endif
+    unsigned char finished;
+    int col_upper_bound;
+    unsigned char correct_column;
+
+    // x <- r
+    for (int i = 0; i < k * o; i++) {
+        x[i] = r[i];
+    }
+
+    // compute Ar;
+    unsigned char Ar[M_MAX];
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = 0; // clear last col of A
+    }
+    mat_mul(A, r, Ar, k * o + 1, m, 1); // uses k*o+1 entries of r; the extra one meets the zeroed last column
+
+    // move y - Ar to last column of matrix A
+    for (int i = 0; i < m; i++) {
+        A[k * o + i * (k * o + 1)] = sub_f(y[i], Ar[i]);
+    }
+
+    EF(A, m, k * o + 1); // presumably reduces A to echelon form in place (see echelon_form.h)
+
+    // check if last row of A (excluding the last entry of y) is zero
+    unsigned char full_rank = 0;
+    for (int i = 0; i < A_cols - 1; i++) { // rows are indexed with A_cols from here on; presumably A_cols == k*o+1
+        full_rank |= A[(m - 1) * A_cols + i];
+    }
+
+// It is okay to leak if we need to restart or not
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(&full_rank, 1);
+#endif
+
+    if (full_rank == 0) {
+        return 0;
+    }
+
+    // back substitution in constant time
+    // the index of the first nonzero entry in each row is secret, which makes
+    // things less efficient
+
+    for (int row = m - 1; row >= 0; row--) {
+        finished = 0; // becomes 0xff once this row's pivot column has been handled
+        col_upper_bound = MAYO_MIN(row + (32/(m-row)), k*o);
+        // the first nonzero entry in row r is between r and col_upper_bound with probability at least ~1-q^{-32}
+
+        for (int col = row; col <= col_upper_bound; col++) {
+
+            // Compare two chars in constant time.
+            // Returns 0x00 if the byte arrays are equal, 0xff otherwise.
+            correct_column = ct_compare_8((A[row * A_cols + col]), 0) & ~finished;
+
+            unsigned char u = correct_column & A[row * A_cols + A_cols - 1]; // right-hand side of this row, or 0 when col is not the pivot
+            x[col] ^= u;
+
+            for (int i = 0; i < row; i += 8) { // eight rows per step; rows up to i+7 are touched even when i+7 >= row
+                uint64_t tmp = ( (uint64_t) A[ i    * A_cols + col] <<  0) ^ ( (uint64_t) A[(i+1) * A_cols + col] <<  8)
+                             ^ ( (uint64_t) A[(i+2) * A_cols + col] << 16) ^ ( (uint64_t) A[(i+3) * A_cols + col] << 24)
+                             ^ ( (uint64_t) A[(i+4) * A_cols + col] << 32) ^ ( (uint64_t) A[(i+5) * A_cols + col] << 40)
+                             ^ ( (uint64_t) A[(i+6) * A_cols + col] << 48) ^ ( (uint64_t) A[(i+7) * A_cols + col] << 56);
+
+                tmp = mul_fx8(u, tmp); // scale the eight gathered column entries by u, one per byte
+
+                A[ i    * A_cols + A_cols - 1] ^= (tmp      ) & 0xf;
+                A[(i+1) * A_cols + A_cols - 1] ^= (tmp >> 8 ) & 0xf;
+                A[(i+2) * A_cols + A_cols - 1] ^= (tmp >> 16) & 0xf;
+                A[(i+3) * A_cols + A_cols - 1] ^= (tmp >> 24) & 0xf;
+                A[(i+4) * A_cols + A_cols - 1] ^= (tmp >> 32) & 0xf;
+                A[(i+5) * A_cols + A_cols - 1] ^= (tmp >> 40) & 0xf;
+                A[(i+6) * A_cols + A_cols - 1] ^= (tmp >> 48) & 0xf;
+                A[(i+7) * A_cols + A_cols - 1] ^= (tmp >> 56) & 0xf;
+            }
+
+            finished = finished | correct_column;
+        }
+    }
+    return 1;
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h
new file mode 100644
index 0000000000..c2fb4fe334
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic.h
@@ -0,0 +1,62 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_H
+#define ARITHMETIC_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <stdint.h>
+
+#if defined(__BYTE_ORDER__) && (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
+#define TARGET_BIG_ENDIAN
+#endif
+
+#if defined(MAYO_AVX) && (M_MAX == 64)
+    #include <shuffle_arithmetic_64.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 96)
+    #include <shuffle_arithmetic_96.h>
+#endif
+#if defined(MAYO_AVX) && (M_MAX == 128)
+    #include <shuffle_arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 64))
+    #include <arithmetic_64.h>
+    #include <arithmetic_96.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 96))
+    #include <arithmetic_96.h>
+    #include <arithmetic_128.h>
+#endif
+#if !defined(MAYO_VARIANT) || (defined(MAYO_VARIANT) && (M_MAX == 128))
+    #include <arithmetic_128.h>
+#endif
+
+// KeyGen: calculate P3 = O^T * (P1*O + P2). NOTE(review): P1O_P2 is writable -- presumably in/out scratch for P1*O + P2; confirm in the implementation.
+#define Ot_times_P1O_P2 MAYO_NAMESPACE(Ot_times_P1O_P2)
+void Ot_times_P1O_P2(const mayo_params_t* p, const uint64_t* P1, const unsigned char* O, uint64_t* P1O_P2, uint64_t* P3);
+
+// KeyGen: calculate Upper of `in` into `out`. NOTE(review): `size` is presumably the matrix dimension -- confirm in the implementation.
+#define m_upper MAYO_NAMESPACE(m_upper)
+void m_upper(int m_legs, const uint64_t *in, uint64_t *out, int size);
+
+// expand_sk: calculate acc = (P1+P1^T)*O; the first matrix argument is the already-formed P1+P1^T and the result goes to `acc`.
+#define P1P1t_times_O MAYO_NAMESPACE(P1P1t_times_O)
+void P1P1t_times_O(const mayo_params_t* p, const uint64_t* P1P1t, const unsigned char* O, uint64_t* acc);
+
+// Sign: calculate M = V*L and Y = V*P1*V^T in one call; M and Y are the two output buffers.
+#define V_times_L__V_times_P1_times_Vt MAYO_NAMESPACE(V_times_L__V_times_P1_times_Vt)
+void V_times_L__V_times_P1_times_Vt(const mayo_params_t* p, const uint64_t* L, const unsigned char* V, uint64_t* M, const uint64_t* P1, uint64_t* Y);
+
+// Sign: sample a solution into `x`; A is writable and is modified. NOTE(review): meaning of the int return value is not visible here -- confirm.
+#define sample_solution MAYO_NAMESPACE(sample_solution)
+int sample_solution(const mayo_params_t *p, unsigned char *A, const unsigned char *y, const unsigned char *r, unsigned char *x, int k, int o, int m, int A_cols);
+
+// Verify: calculate SPS = S*P*S^T, with P supplied as its three blocks P1, P2, P3; the result goes to `SPS`.
+#define m_calculate_PS_SPS MAYO_NAMESPACE(m_calculate_PS_SPS)
+void m_calculate_PS_SPS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *SPS);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h
new file mode 100644
index 0000000000..418c308e2f
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_128.h
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_128_H
+#define ARITHMETIC_128_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 128 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one 128-element vector from `in` to `out`.
+// With 4-bit elements packed into 64-bit words this is eight limbs.
+static inline void vec_copy_128(const uint64_t *in, uint64_t *out) {
+    const uint64_t *src = in;
+    uint64_t *dst = out;
+    int remaining = 8;
+    while (remaining > 0) {
+        *dst++ = *src++;
+        remaining--;
+    }
+}
+
+
+// XOR-accumulate a 128-element vector: acc[i] ^= in[i] for all eight limbs.
+// XOR on the packed words adds every 4-bit element at once, because
+// addition in Z_2[x]/(x^4+x+1) is coefficient-wise addition modulo 2.
+static inline void vec_add_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *src = in;
+    uint64_t *dst = acc;
+    const uint64_t *const src_end = in + 8;
+    while (src != src_end) {
+        *dst++ ^= *src++;
+    }
+}
+
+// acc += x * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the eight limbs.
+static inline void m_vec_mul_add_x_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t carry = (in[limb] & top_bits) >> 3; // 1 in each nibble that overflows on the shift
+        acc[limb] ^= ((in[limb] & ~top_bits) << 1) ^ (carry * 3); // x^4 reduces to x + 1 (0b0011)
+    }
+}
+
+// acc += x^-1 * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the eight limbs.
+static inline void m_vec_mul_add_x_inv_128(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 8; limb++) {
+        const uint64_t odd = in[limb] & low_bits; // nibbles that are not a multiple of x
+        acc[limb] ^= ((in[limb] & ~low_bits) >> 1) ^ (odd * 9); // x^-1 = x^3 + 1 (0b1001)
+    }
+}
+
+// acc += a * in over GF(16) for a 128-element vector (eight limbs).
+static inline void vec_mul_add_128(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t products = mul_table(a); // one multiple of `a` per byte
+    const uint64_t by_1  = products & 0xff;          // a * 1
+    const uint64_t by_x  = (products >> 8) & 0xf;    // a * x
+    const uint64_t by_x2 = (products >> 16) & 0xf;   // a * x^2
+    const uint64_t by_x3 = (products >> 24) & 0xf;   // a * x^3
+    for (int limb = 0; limb < 8; limb++) {
+        acc[limb] ^= ((in[limb] & lsb_mask) * by_1) ^ (((in[limb] >> 1) & lsb_mask) * by_x)
+                   ^ (((in[limb] >> 2) & lsb_mask) * by_x2) ^ (((in[limb] >> 3) & lsb_mask) * by_x3);
+    }
+}
+
+// Collapse the 16 accumulator bins (8 limbs each) into one vector, written to `out`.
+// Two chains of scaled additions funnel every bin into bin 1; bin 0 is never read.
+static inline void multiply_bins_128(uint64_t *bins, uint64_t *out) {
+    // {source, destination} bins in execution order; even steps scale by x^-1, odd steps by x.
+    static const unsigned char steps[14][2] = {
+        { 5, 10}, {11, 12}, {10,  7}, {12,  6}, { 7, 14}, { 6,  3}, {14, 15},
+        { 3,  8}, {15, 13}, { 8,  4}, {13,  9}, { 4,  2}, { 9,  1}, { 2,  1},
+    };
+    for (int s = 0; s < 14; s++) {
+        const uint64_t *from = bins + steps[s][0] * 8;
+        uint64_t *into = bins + steps[s][1] * 8;
+        if ((s & 1) == 0) {
+            m_vec_mul_add_x_inv_128(from, into);
+        } else {
+            m_vec_mul_add_x_128(from, into);
+        }
+    }
+    vec_copy_128(bins + 8, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h
new file mode 100644
index 0000000000..a70b7a3118
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_64.h
@@ -0,0 +1,128 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_64_H
+#define ARITHMETIC_64_H
+
+#include <stdint.h>
+#include <mayo.h>
+
+// This implements arithmetic for vectors of 64 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy a 64-element vector (four 64-bit limbs) from `in` to `out`.
+static inline void vec_copy_64(const uint64_t *in, uint64_t *out) {
+    int limb = 0;
+    for (; limb < 4; ++limb) {
+        out[limb] = in[limb];
+    }
+}
+
+// acc ^= in for a 64-element vector (four limbs); XOR is addition in characteristic 2.
+static inline void vec_add_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *const stop = in + 4;
+    while (in != stop) {
+        *acc++ ^= *in++;
+    }
+}
+
+// Multiply each of the 16 packed GF(16) nibbles of `a` by the scalar `b`.
+//
+// Only the low four bits of `b` are used. The product is built by the usual
+// shift-and-add method: for each bit j of b, add (a * x^j) to the result,
+// where a * x^j is obtained by repeatedly multiplying by x and reducing
+// modulo x^4 + x + 1.
+static inline uint64_t gf16v_mul_u64( uint64_t a, uint8_t b ) {
+    const uint64_t nibble_top = 0x8888888888888888ULL;
+    const uint64_t scalar = b;
+    uint64_t shifted = a;                  // a * x^j for the current j
+    uint64_t product = shifted * (scalar & 1);
+
+    for (int j = 1; j <= 3; j++) {
+        // Multiply `shifted` by x: strip the x^3 coefficients, shift the rest
+        // up, and fold each stripped coefficient back in as x + 1 (0b0011).
+        const uint64_t overflow = shifted & nibble_top;
+        shifted ^= overflow;
+        shifted = (shifted << 1) ^ ((overflow >> 3) * 3);
+        // Multiplying by a 0/1 value selects the term without branching.
+        product ^= shifted * ((scalar >> j) & 1);
+    }
+
+    return product;
+}
+
+// Pack b*1, b*x, b*x^2, b*x^3 (GF(16) products) into the low nibbles of bytes 0..3.
+// The high nibbles of bytes 1..3 may keep leftover bits; the callers in these headers mask them with 0xf.
+static inline uint32_t mul_table(uint8_t b){
+    const uint32_t spread = (uint32_t) b * 0x08040201; // for b < 16 the bytes are b, 2b, 4b, 8b (integer products)
+    const uint32_t overflow = spread & 0xf0f0f0f0;      // bits at x^4 and above in each byte
+    // x^4 = x + 1: fold each overflow bit back in as (bit >> 4) ^ (bit >> 3).
+    return spread ^ (overflow >> 4) ^ (overflow >> 3);
+}
+
+// acc += a * in over GF(16) for a 64-element vector (four limbs).
+static inline void vec_mul_add_64(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t products = mul_table(a); // one multiple of `a` per byte
+    const uint64_t by_1  = products & 0xff;          // a * 1
+    const uint64_t by_x  = (products >> 8) & 0xf;    // a * x
+    const uint64_t by_x2 = (products >> 16) & 0xf;   // a * x^2
+    const uint64_t by_x3 = (products >> 24) & 0xf;   // a * x^3
+    for (int limb = 0; limb < 4; limb++) {
+        acc[limb] ^= ((in[limb] & lsb_mask) * by_1) ^ (((in[limb] >> 1) & lsb_mask) * by_x)
+                   ^ (((in[limb] >> 2) & lsb_mask) * by_x2) ^ (((in[limb] >> 3) & lsb_mask) * by_x3);
+    }
+}
+
+// acc += a * in over GF(16) for a vector of `legs` 64-bit limbs.
+static inline void vec_mul_add_u64(const int legs, const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t products = mul_table(a); // one multiple of `a` per byte
+    const uint64_t by_1  = products & 0xff;          // a * 1
+    const uint64_t by_x  = (products >> 8) & 0xf;    // a * x
+    const uint64_t by_x2 = (products >> 16) & 0xf;   // a * x^2
+    const uint64_t by_x3 = (products >> 24) & 0xf;   // a * x^3
+    for (int limb = 0; limb < legs; limb++) {
+        acc[limb] ^= ((in[limb] & lsb_mask) * by_1) ^ (((in[limb] >> 1) & lsb_mask) * by_x)
+                   ^ (((in[limb] >> 2) & lsb_mask) * by_x2) ^ (((in[limb] >> 3) & lsb_mask) * by_x3);
+    }
+}
+
+// acc += x * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the four limbs.
+static inline void m_vec_mul_add_x_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t carry = (in[limb] & top_bits) >> 3; // 1 in each nibble that overflows on the shift
+        acc[limb] ^= ((in[limb] & ~top_bits) << 1) ^ (carry * 3); // x^4 reduces to x + 1 (0b0011)
+    }
+}
+
+// acc += x^-1 * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the four limbs.
+static inline void m_vec_mul_add_x_inv_64(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 4; limb++) {
+        const uint64_t odd = in[limb] & low_bits; // nibbles that are not a multiple of x
+        acc[limb] ^= ((in[limb] & ~low_bits) >> 1) ^ (odd * 9); // x^-1 = x^3 + 1 (0b1001)
+    }
+}
+
+// Collapse the 16 accumulator bins (4 limbs each) into one vector, written to `out`.
+// Two chains of scaled additions funnel every bin into bin 1; bin 0 is never read.
+static inline void multiply_bins_64(uint64_t *bins, uint64_t *out) {
+    // {source, destination} bins in execution order; even steps scale by x^-1, odd steps by x.
+    static const unsigned char steps[14][2] = {
+        { 5, 10}, {11, 12}, {10,  7}, {12,  6}, { 7, 14}, { 6,  3}, {14, 15},
+        { 3,  8}, {15, 13}, { 8,  4}, {13,  9}, { 4,  2}, { 9,  1}, { 2,  1},
+    };
+    for (int s = 0; s < 14; s++) {
+        const uint64_t *from = bins + steps[s][0] * 4;
+        uint64_t *into = bins + steps[s][1] * 4;
+        if ((s & 1) == 0) {
+            m_vec_mul_add_x_inv_64(from, into);
+        } else {
+            m_vec_mul_add_x_64(from, into);
+        }
+    }
+    vec_copy_64(bins + 4, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h
new file mode 100644
index 0000000000..a38f89e454
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_96.h
@@ -0,0 +1,85 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_96_H
+#define ARITHMETIC_96_H
+
+#include <stdint.h>
+#include <mayo.h>
+#include <arithmetic_64.h>
+
+// This implements arithmetic for vectors of 96 field elements in Z_2[x]/(x^4+x+1)
+
+// Copy one 96-element vector (six 64-bit limbs) from `in` to `out`.
+static inline void vec_copy_96(const uint64_t *in, uint64_t *out) {
+    const uint64_t *src = in;
+    uint64_t *dst = out;
+    int remaining = 6;
+    while (remaining-- > 0) {
+        *dst++ = *src++;
+    }
+}
+
+// XOR-accumulate a 96-element vector: acc[i] ^= in[i] for all six limbs.
+static inline void vec_add_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t *src = in;
+    uint64_t *dst = acc;
+    const uint64_t *const src_end = in + 6;
+    while (src != src_end) {
+        *dst++ ^= *src++;
+    }
+}
+
+// acc += x * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the six limbs.
+static inline void m_vec_mul_add_x_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t top_bits = 0x8888888888888888ULL; // x^3 coefficient of each nibble
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t carry = (in[limb] & top_bits) >> 3; // 1 in each nibble that overflows on the shift
+        acc[limb] ^= ((in[limb] & ~top_bits) << 1) ^ (carry * 3); // x^4 reduces to x + 1 (0b0011)
+    }
+}
+
+// acc += x^-1 * in over GF(16) = Z_2[x]/(x^4+x+1), for every packed nibble of the six limbs.
+static inline void m_vec_mul_add_x_inv_96(const uint64_t *in, uint64_t *acc) {
+    const uint64_t low_bits = 0x1111111111111111ULL; // constant coefficient of each nibble
+    for (int limb = 0; limb < 6; limb++) {
+        const uint64_t odd = in[limb] & low_bits; // nibbles that are not a multiple of x
+        acc[limb] ^= ((in[limb] & ~low_bits) >> 1) ^ (odd * 9); // x^-1 = x^3 + 1 (0b1001)
+    }
+}
+
+// acc += a * in over GF(16) for a 96-element vector (six limbs).
+static inline void vec_mul_add_96(const uint64_t *in, unsigned char a, uint64_t *acc) {
+    const uint64_t lsb_mask = 0x1111111111111111ULL;
+    const uint32_t products = mul_table(a); // one multiple of `a` per byte
+    const uint64_t by_1  = products & 0xff;          // a * 1
+    const uint64_t by_x  = (products >> 8) & 0xf;    // a * x
+    const uint64_t by_x2 = (products >> 16) & 0xf;   // a * x^2
+    const uint64_t by_x3 = (products >> 24) & 0xf;   // a * x^3
+    for (int limb = 0; limb < 6; limb++) {
+        acc[limb] ^= ((in[limb] & lsb_mask) * by_1) ^ (((in[limb] >> 1) & lsb_mask) * by_x)
+                   ^ (((in[limb] >> 2) & lsb_mask) * by_x2) ^ (((in[limb] >> 3) & lsb_mask) * by_x3);
+    }
+}
+
+// Collapse the 16 accumulator bins (6 limbs each) into one vector, written to `out`.
+// Two chains of scaled additions funnel every bin into bin 1; bin 0 is never read.
+static inline void multiply_bins_96(uint64_t *bins, uint64_t *out) {
+    // {source, destination} bins in execution order; even steps scale by x^-1, odd steps by x.
+    static const unsigned char steps[14][2] = {
+        { 5, 10}, {11, 12}, {10,  7}, {12,  6}, { 7, 14}, { 6,  3}, {14, 15},
+        { 3,  8}, {15, 13}, { 8,  4}, {13,  9}, { 4,  2}, { 9,  1}, { 2,  1},
+    };
+    for (int s = 0; s < 14; s++) {
+        const uint64_t *from = bins + steps[s][0] * 6;
+        uint64_t *into = bins + steps[s][1] * 6;
+        if ((s & 1) == 0) {
+            m_vec_mul_add_x_inv_96(from, into);
+        } else {
+            m_vec_mul_add_x_96(from, into);
+        }
+    }
+    vec_copy_96(bins + 6, out);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h
new file mode 100644
index 0000000000..d337bc238c
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/arithmetic_common.h
@@ -0,0 +1,469 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ARITHMETIC_COMMON_H
+#define ARITHMETIC_COMMON_H
+
+#include <stdalign.h>
+
+#ifndef MAYO_VARIANT
+// Collapse the 16 accumulator bins into a single vector and copy it to `out`.
+//
+// m_legs  vector-width parameter forwarded to the m_vec_* helpers; each bin
+//         occupies m_legs * 2 consecutive uint64_t words
+// bins    16 bins stored back to back; modified in place (bin 0 is not touched)
+// out     receives the contents of bin 1 once all other bins are folded in
+static void m_multiply_bins(const int m_legs, uint64_t *bins, uint64_t *out) {
+    const int bin_words = m_legs * 2;
+
+    // Phase 1: add each bin whose index has several bits set into two bins
+    // whose indices XOR to it. The order is significant: a destination such
+    // as bin 12 is itself a source a few steps later.
+    static const unsigned char split[22][2] = {
+        {15, 12}, {15,  3},
+        {14,  8}, {14,  6},
+        {13, 10}, {13,  7},
+        {12,  8}, {12,  4},
+        {11,  9}, {11,  2},
+        {10,  8}, {10,  2},
+        { 9,  8}, { 9,  1},
+        { 7,  4}, { 7,  3},
+        { 6,  4}, { 6,  2},
+        { 5,  4}, { 5,  1},
+        { 3,  2}, { 3,  1},
+    };
+    for (int step = 0; step < 22; step++) {
+        uint64_t *src = bins + split[step][0] * bin_words;
+        uint64_t *dst = bins + split[step][1] * bin_words;
+        m_vec_add(m_legs, src, dst);
+    }
+
+    // Phase 2: only bins 8, 4, 2 and 1 still matter. Fold each one into the
+    // bin of half its index via m_vec_mul_add_x.
+    for (int bin = 8; bin > 1; bin /= 2) {
+        uint64_t *src = bins + bin * bin_words;
+        uint64_t *dst = bins + (bin / 2) * bin_words;
+        m_vec_mul_add_x(m_legs, src, dst);
+    }
+
+    m_vec_copy(m_legs, bins + 1 * bin_words, out);
+}
+#endif
+
+// compute P * S^t = [ P1  P2 ] * [S1] = [P1*S1 + P2*S2]
+//                   [  0  P3 ]   [S2]   [        P3*S2]
+static inline void mayo_generic_m_calculate_PS(const uint64_t *P1, const uint64_t *P2, const uint64_t *P3, const unsigned char *S,
+                              const int m, const int v, const int o, const int k, uint64_t *PS) {
+    // Bin accumulation: every vector of P is XORed into one of 16 bins picked by the matching entry of S (entries are used as indices and assumed < 16), then each group of 16 bins is folded into one PS vector.
+    const int n = o + v;
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || M_MAX == 128)
+    (void)m;
+#else
+    const int m_legs = m / 32;
+#endif
+    // Layout as read off the indexing below: S is k rows of n entries; P1 and P3 are packed upper triangles (v x v and o x o); P2 is v x o; PS vector (row, col) sits at index row * k + col.
+    /* Old approach which is constant time but doesn't have to be
+    unsigned char S1[V_MAX*K_MAX];
+    unsigned char S2[O_MAX*K_MAX];
+    unsigned char *s1_write = S1;
+    unsigned char *s2_write = S2;
+    for (int r=0; r < k; r++)
+    {
+        for (int c = 0; c < n; c++)
+        {
+            if(c < v){
+                *(s1_write++) = S[r*n + c];
+            } else {
+                *(s2_write++) = S[r*n + c];
+            }
+        }
+    }
+
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P1, S1, PS, v, v, k, 1); // P1 * S1
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P2, S2, PS, v, o, k, 0); // P2 * S2
+    mul_add_m_upper_triangular_mat_x_mat_trans(m_legs, P3, S2, PS + v*k*m_legs*4, o, o, k, 1); // P3 * S2.
+    */
+
+    // Stack-lean variant, intended for MAYO_3 and MAYO_5 (N_MAX > 78): handles one row of S at a time, so only 16 bins per row of P are live.
+    #if (defined(HAVE_STACKEFFICIENT) || defined(PQM4)) && N_MAX > 78
+    uint64_t accumulator[M_MAX * N_MAX] = {0};
+    int P1_used;
+    int P3_used;
+    for (int col = 0; col < k; col++) { // col selects a row of S, i.e. a column of S^t
+        for(unsigned int i = 0; i < sizeof(accumulator)/8; i++) {
+            accumulator[i] = 0;
+        }
+        P1_used = 0; // P1 block: rows 0..v-1, upper triangle only
+        for (int row = 0; row < v; row++) {
+            for (int j = row; j < v; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P1 + P1_used * 4, accumulator + ( row * 16 + S[col * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P1 + P1_used * 6, accumulator + ( row * 16 + S[col * n + j] ) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P1 + P1_used * 8, accumulator + ( row * 16 + S[col * n + j] ) * 8);
+#else
+                bitsliced_m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P1_used ++;
+            }
+            // P2 block of the same row: paired with entries v..n-1 of the current row of S.
+            for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P2 + (row * o + j)*4, accumulator + ( row * 16 + S[(col * n) + j + v] )*4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P2 + (row * o + j)*6, accumulator + ( row * 16 + S[(col * n) + j + v] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P2 + (row * o + j)*8, accumulator + ( row * 16 + S[(col * n) + j + v] )*8 );
+#else
+                bitsliced_m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( row * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+#endif
+            }
+        }
+        // P3 block: rows v..n-1, upper triangle only.
+        P3_used = 0;
+        for (int row = v; row < n; row++) {
+            for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(P3 + P3_used * 4, accumulator + ( row * 16 + S[col * n + j] )*4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(P3 + P3_used * 6, accumulator + ( row * 16 + S[col * n + j] )*6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(P3 + P3_used * 8, accumulator + ( row * 16 + S[col * n + j] )*8);
+#else
+                bitsliced_m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( row * 16 + S[col * n + j] )*m_legs * 2 );
+#endif
+                P3_used ++;
+            }
+        }
+
+        for (int row = 0; row < n; row++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+           multiply_bins_64(accumulator + row * 16 * 4, PS + (row * k + col) * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+           multiply_bins_96(accumulator + row * 16 * 6, PS + (row * k + col) * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+           multiply_bins_128(accumulator + row * 16 * 8, PS + (row * k + col) * 8);
+#else
+           bitsliced_m_multiply_bins(m_legs, accumulator + row * 16 * m_legs * 2, PS + (row * k + col) * m_legs * 2);
+#endif
+        }
+    }
+    // NOTE(review): the generic fallbacks above call bitsliced_m_vec_add / bitsliced_m_multiply_bins, while the default branch below uses m_vec_add / m_multiply_bins -- confirm the bitsliced_* names still exist.
+    #else
+    // Default variant: bins for all k rows of S are kept at once (16 bins per (row, col) pair), so each vector of P is loaded once and added k times.
+    alignas (32) uint64_t accumulator[M_MAX * K_MAX * N_MAX] = {0};
+    int P1_used = 0;
+    for (int row = 0; row < v; row++) {
+        for (int j = row; j < v; j++) { // P1 block: upper triangle; the fixed parameter sets below are the col loop unrolled by hand
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P1 + P1_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P1 + P1_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P1 + P1_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P1 + P1_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P1_used ++;
+        }
+
+        // P2 block of the same row: paired with entries v..n-1 of every row of S.
+        for (int j = 0; j < o; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 4 );
+            vec_add_64(P2 + (row * o + j) * 4, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 6 );
+            vec_add_96(P2 + (row * o + j) * 6, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 0) * 16 + S[(0 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 1) * 16 + S[(1 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 2) * 16 + S[(2 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 3) * 16 + S[(3 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 4) * 16 + S[(4 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 5) * 16 + S[(5 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 6) * 16 + S[(6 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 7) * 16 + S[(7 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 8) * 16 + S[(8 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 9) * 16 + S[(9 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 10) * 16 + S[(10 * n) + j + v] ) * 8 );
+            vec_add_128(P2 + (row * o + j) * 8, accumulator + ( (row * k + 11) * 16 + S[(11 * n) + j + v] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P2 + (row * o + j)*m_legs * 2, accumulator + ( (row * k + col) * 16 + S[(col * n) + j + v] )*m_legs * 2 );
+            }
+#endif
+        }
+    }
+    // P3 block: rows v..n-1, upper triangle only.
+    int P3_used = 0;
+    for (int row = v; row < n; row++) {
+        for (int j = row; j < n; j++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 9)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 64) && (K_MAX == 4)
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 4 );
+            vec_add_64(P3 + P3_used * 4, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 4 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 96) && (K_MAX == 11)
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 6 );
+            vec_add_96(P3 + P3_used * 6, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 6 );
+#elif defined(MAYO_VARIANT) && (M_MAX == 128) && (K_MAX == 12)
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 0) * 16 + S[0 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 1) * 16 + S[1 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 2) * 16 + S[2 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 3) * 16 + S[3 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 4) * 16 + S[4 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 5) * 16 + S[5 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 6) * 16 + S[6 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 7) * 16 + S[7 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 8) * 16 + S[8 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 9) * 16 + S[9 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 10) * 16 + S[10 * n + j] ) * 8 );
+            vec_add_128(P3 + P3_used * 8, accumulator + ( (row * k + 11) * 16 + S[11 * n + j] ) * 8 );
+#else
+            for (int col = 0; col < k; col++) {
+                m_vec_add(m_legs, P3 + P3_used * m_legs * 2, accumulator + ( (row * k + col) * 16 + S[col * n + j] )*m_legs * 2 );
+            }
+#endif
+            P3_used ++;
+        }
+    }
+
+    // Fold each group of 16 bins into one vector and store it as PS vector i (the multiply_bins helpers copy into PS, they do not accumulate).
+    int i = 0;
+    while (i < n * k) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + i * 16 * 4, PS + i * 4);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + i * 16 * 6, PS + i * 6);
+        i++;
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + i * 16 * 8, PS + i * 8);
+        i++;
+#else
+        m_multiply_bins(m/32, accumulator + i * 16 * (m/32) * 2, PS + i * (m/32) * 2);
+        i++;
+#endif
+    }
+
+    #endif
+}
+
+
+// Accumulates the k x k block SPS from PS (n x k vectors) and the matrix S
+// (k x n, one byte per entry) using a two-phase "bins" strategy:
+//   1. every vector PS[j][col] is added into the bin selected by S[row][j]
+//      (16 bins per output entry, so S entries are presumably nibbles 0..15);
+//   2. the 16 bins of each output entry are combined by the multiply_bins
+//      helper and the result is written/added to SPS.
+// m is only consulted in the generic (non-MAYO_VARIANT) build, where a vector
+// occupies (m/32)*2 64-bit words; fixed-parameter builds use 4, 6 or 8 words.
+static inline void mayo_generic_m_calculate_SPS(const uint64_t *PS, const unsigned char *S, int m, int k, int  n, uint64_t *SPS){
+    alignas (32) uint64_t accumulator[M_MAX*K_MAX*K_MAX] = {0};
+#if defined(MAYO_VARIANT)
+    (void) m;
+#else
+    const int m_legs = m/32;
+#endif
+
+    // Phase 1: sort the PS vectors into bins keyed by the entries of S.
+    for (int row = 0; row < k; row++) {
+        for (int j = 0; j < n; j++) {
+            const int bin = S[row * n + j];
+            for (int col = 0; col < k; col++) {
+                const int src = j * k + col;                 // vector index inside PS
+                const int dst = (row * k + col) * 16 + bin;  // vector index inside accumulator
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_add_64(PS + src * 4, accumulator + dst * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_add_96(PS + src * 6, accumulator + dst * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_add_128(PS + src * 8, accumulator + dst * 8);
+#else
+                m_vec_add(m_legs, PS + src * m_legs * 2, accumulator + dst * m_legs * 2);
+#endif
+            }
+        }
+    }
+
+    // Phase 2: fold the 16 bins of every output entry into SPS.
+    for (int entry = 0; entry < k * k; entry++) {
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+        multiply_bins_64(accumulator + entry * 16 * 4, SPS + entry * 4);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+        multiply_bins_96(accumulator + entry * 16 * 6, SPS + entry * 6);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+        multiply_bins_128(accumulator + entry * 16 * 8, SPS + entry * 8);
+#else
+        m_multiply_bins(m_legs, accumulator + entry * 16 * m_legs * 2, SPS + entry * m_legs * 2);
+#endif
+    }
+}
+
+
+// acc += bs_mat * mat, for all m nibble-sliced matrices at once.
+//   bs_mat : bs_mat_rows x bs_mat_cols entries, each one vector of 64-bit words;
+//            when triangular is 1 only the entries with column >= row are
+//            stored (row by row), when it is 0 every entry is stored.
+//   mat    : bs_mat_cols x mat_cols byte matrix (row-major).
+//   acc    : bs_mat_rows x mat_cols vectors (row-major), updated in place.
+// m_legs is only used by the generic build; fixed builds use 4/6/8 words.
+static inline void mul_add_m_upper_triangular_mat_x_mat(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_cols, int triangular) {
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || (M_MAX == 128))
+    (void) m_legs;
+#endif
+
+    int entry = 0;  // running index of the current stored entry of bs_mat
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++, entry++) {
+            for (int j = 0; j < mat_cols; j++) {
+                const unsigned char scalar = mat[c * mat_cols + j];
+                const int out = r * mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * entry, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * entry, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * entry, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * entry, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+
+// acc += bs_mat * mat^T, for all m nibble-sliced matrices at once.
+//   bs_mat : bs_mat_rows x bs_mat_cols entries (only column >= row stored when
+//            triangular is 1), each one vector of 64-bit words.
+//   mat    : mat_rows x bs_mat_cols byte matrix (row-major), used transposed.
+//   acc    : bs_mat_rows x mat_rows vectors (row-major), updated in place.
+// m_legs is only used by the generic build; fixed builds use 4/6/8 words.
+static inline void mul_add_m_upper_triangular_mat_x_mat_trans(int m_legs, const uint64_t *bs_mat, const unsigned char *mat, uint64_t *acc, int bs_mat_rows, int bs_mat_cols, int mat_rows, int triangular) {
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || (M_MAX == 128))
+    (void) m_legs;
+#endif
+
+    int entry = 0;  // running index of the current stored entry of bs_mat
+    for (int r = 0; r < bs_mat_rows; r++) {
+        for (int c = triangular * r; c < bs_mat_cols; c++, entry++) {
+            for (int j = 0; j < mat_rows; j++) {
+                const unsigned char scalar = mat[j * bs_mat_cols + c];
+                const int out = r * mat_rows + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * entry, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * entry, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * entry, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * entry, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+
+// acc += mat^T * bs_mat, for all m nibble-sliced matrices at once.
+//   mat    : mat_rows x mat_cols byte matrix (row-major), used transposed.
+//   bs_mat : mat_rows x bs_mat_cols vectors (row-major).
+//   acc    : mat_cols x bs_mat_cols vectors (row-major), updated in place.
+// m_legs is only used by the generic build; fixed builds use 4/6/8 words.
+static inline void mul_add_mat_trans_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || (M_MAX == 128))
+    (void) m_legs;
+#endif
+
+    for (int r = 0; r < mat_cols; r++) {
+        for (int c = 0; c < mat_rows; c++) {
+            for (int j = 0; j < bs_mat_cols; j++) {
+                const unsigned char scalar = mat[c * mat_cols + r];  // entry (r, c) of mat^T
+                const int in  = c * bs_mat_cols + j;
+                const int out = r * bs_mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * in, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * in, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * in, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * in, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+
+// acc += mat * bs_mat, for all m nibble-sliced matrices at once.
+//   mat    : mat_rows x mat_cols byte matrix (row-major).
+//   bs_mat : mat_cols x bs_mat_cols vectors (row-major).
+//   acc    : mat_rows x bs_mat_cols vectors (row-major), updated in place.
+// m_legs is only used by the generic build; fixed builds use 4/6/8 words.
+static inline void mul_add_mat_x_m_mat(int m_legs, const unsigned char *mat, const uint64_t *bs_mat, uint64_t *acc, int mat_rows, int mat_cols, int bs_mat_cols) {
+#if defined(MAYO_VARIANT) && ((M_MAX == 64) || (M_MAX == 96) || (M_MAX == 128))
+    (void) m_legs;
+#endif
+
+    for (int r = 0; r < mat_rows; r++) {
+        for (int c = 0; c < mat_cols; c++) {
+            for (int j = 0; j < bs_mat_cols; j++) {
+                const unsigned char scalar = mat[r * mat_cols + c];
+                const int in  = c * bs_mat_cols + j;
+                const int out = r * bs_mat_cols + j;
+#if defined(MAYO_VARIANT) && (M_MAX == 64)
+                vec_mul_add_64(bs_mat + 4 * in, scalar, acc + 4 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 96)
+                vec_mul_add_96(bs_mat + 6 * in, scalar, acc + 6 * out);
+#elif defined(MAYO_VARIANT) && (M_MAX == 128)
+                vec_mul_add_128(bs_mat + 8 * in, scalar, acc + 8 * out);
+#else
+                m_vec_mul_add(m_legs, bs_mat + m_legs * 2 * in, scalar, acc + m_legs * 2 * out);
+#endif
+            }
+        }
+    }
+}
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h b/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h
new file mode 100644
index 0000000000..82505847c9
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/echelon_form.h
@@ -0,0 +1,152 @@
+
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef ECHELON_FORM_H
+#define ECHELON_FORM_H
+
+#include <stdalign.h>
+#include <stdint.h>
+#include <mem.h>
+#include <arithmetic.h>
+
+#define MAYO_MAX(x, y) (((x) > (y)) ? (x) : (y))
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+
+// Returns the 4-bit element number `index` of a nibble-packed vector:
+// 16 elements per 64-bit word, element 0 in the least significant nibble.
+static inline unsigned char
+m_extract_element(const uint64_t *in, int index) {
+    const uint64_t word = in[index / 16];
+    const int shift = (index % 16) * 4;
+
+    return (word >> shift) & 0xF;
+}
+
+// Packs ncols 4-bit values (one per input byte) into `out`, two per byte with
+// the even-indexed value in the low nibble. An odd trailing value is stored
+// alone in the low nibble of the next byte. Bytes of `out` beyond the packed
+// data are left untouched. On TARGET_BIG_ENDIAN builds the byte position is
+// mirrored inside each 8-byte word.
+static inline void
+ef_pack_m_vec(const unsigned char *in, uint64_t *out, int ncols) {
+    unsigned char *bytes = (unsigned char *)out;
+    const int pairs = ncols / 2;
+    int b;
+
+    for (b = 0; b < pairs; b++) {
+#ifdef TARGET_BIG_ENDIAN
+        const int pos = (((b + 8) / 8) * 8) - 1 - b % 8;
+#else
+        const int pos = b;
+#endif
+        bytes[pos] = (in[2 * b] << 0) | (in[2 * b + 1] << 4);
+    }
+
+    if (ncols % 2 == 1) {
+        // here b == pairs, i.e. the byte that receives the unpaired last value
+#ifdef TARGET_BIG_ENDIAN
+        const int pos = (((b + 8) / 8) * 8) - 1 - b % 8;
+#else
+        const int pos = b;
+#endif
+        bytes[pos] = (in[2 * b] << 0);
+    }
+}
+
+// Inverse of the nibble packing: expands `legs` 64-bit words (8 bytes each)
+// into legs*16 output bytes, low nibble of every input byte first. On
+// TARGET_BIG_ENDIAN builds the byte position is mirrored inside each word.
+static inline void
+ef_unpack_m_vec(int legs, const uint64_t *in, unsigned char *out) {
+    const unsigned char *bytes = (const unsigned char *)in;
+    const int nbytes = legs * 8;
+
+    for (int b = 0; b < nbytes; b++) {
+#ifdef TARGET_BIG_ENDIAN
+        const int src = (((b + 8) / 8) * 8) - 1 - b % 8;
+#else
+        const int src = b;
+#endif
+        out[2 * b]     = (bytes[src]) & 0xF;
+        out[2 * b + 1] = (bytes[src] >> 4);
+    }
+}
+
+
+// Puts the nrows x ncols matrix A (one field element per byte, row-major) in
+// row echelon form with ones on the first nonzero entry of each row, *in
+// constant time* with respect to the matrix contents.
+//
+// The matrix is nibble-packed into 64-bit words (16 entries per word), reduced
+// in packed form, and unpacked back into A at the end. Which row serves as the
+// pivot row is secret; the pivot column is not. Every row-dependent decision is
+// therefore made with the ct_* mask helpers rather than with branches or plain
+// comparisons. All scratch buffers are wiped before returning.
+static inline void EF(unsigned char *A, int nrows, int ncols) {
+
+    alignas (32) uint64_t _pivot_row[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t _pivot_row2[(K_MAX * O_MAX + 1 + 15) / 16];
+    alignas (32) uint64_t packed_A[((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX] = { 0 };
+
+    // number of 64-bit words per packed row
+    int row_len = (ncols + 15) / 16;
+
+    // nibbleslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_pack_m_vec(A + i * ncols, packed_A + i * row_len, ncols);
+    }
+
+    // pivot row is secret, pivot col is not
+
+    unsigned char inverse;
+    int pivot_row = 0;
+    for (int pivot_col = 0; pivot_col < ncols; pivot_col++) {
+
+        int pivot_row_lower_bound = MAYO_MAX(0, pivot_col + nrows - ncols);
+        int pivot_row_upper_bound = MAYO_MIN(nrows - 1, pivot_col);
+        // the pivot row is guaranteed to be between these lower and upper bounds if
+        // A has full rank
+
+        // zero out pivot row
+        for (int i = 0; i < row_len; i++) {
+            _pivot_row[i] = 0;
+            _pivot_row2[i] = 0;
+        }
+
+        // try to get a pivot row in constant time: start from row `pivot_row`
+        // and keep adding rows below it for as long as the pivot entry of the
+        // accumulated row is still zero
+        unsigned char pivot = 0;
+        uint64_t pivot_is_zero = -1;
+        for (int row = pivot_row_lower_bound;
+                row <= MAYO_MIN(nrows - 1, pivot_row_upper_bound + 32); row++) {
+
+            // all-ones masks when the condition holds, zero otherwise
+            uint64_t is_pivot_row = ~ct_compare_64(row, pivot_row);
+            uint64_t below_pivot_row = ct_64_is_greater_than(row, pivot_row);
+
+            for (int j = 0; j < row_len; j++) {
+                _pivot_row[j] ^= (is_pivot_row | (below_pivot_row & pivot_is_zero)) &
+                                 packed_A[row * row_len + j];
+            }
+            pivot = m_extract_element(_pivot_row, pivot_col);
+            pivot_is_zero = ~ct_compare_64((int) pivot, 0);
+        }
+
+        // multiply pivot row by inverse of pivot
+        inverse = inverse_f(pivot);
+        vec_mul_add_u64(row_len, _pivot_row, inverse, _pivot_row2);
+
+        // conditionally write pivot row to the correct row, if there is a nonzero
+        // pivot
+        for (int row = pivot_row_lower_bound; row <= pivot_row_upper_bound; row++) {
+            uint64_t do_copy = ~ct_compare_64(row, pivot_row) & ~pivot_is_zero;
+            uint64_t do_not_copy = ~do_copy;
+            for (int col = 0; col < row_len; col++) {
+                packed_A[row * row_len + col] =
+                    (do_not_copy & packed_A[row * row_len + col]) +
+                    (do_copy & _pivot_row2[col]);
+            }
+        }
+
+        // eliminate entries below pivot
+        for (int row = pivot_row_lower_bound; row < nrows; row++) {
+            // pivot_row is secret: derive the 0/1 selector from the constant-time
+            // mask helper (same predicate as below_pivot_row above) instead of a
+            // plain `row > pivot_row`, which a compiler may turn into a branch.
+            unsigned char below_pivot = (unsigned char) (ct_64_is_greater_than(row, pivot_row) & 1);
+            unsigned char elt_to_elim = m_extract_element(packed_A + row * row_len, pivot_col);
+
+            vec_mul_add_u64(row_len, _pivot_row2, below_pivot * elt_to_elim,
+                                    packed_A + row * row_len);
+        }
+
+        // advance to the next pivot row only if a nonzero pivot was found
+        // (~pivot_is_zero is all-ones, i.e. -1 as int64_t, in that case)
+        pivot_row += (-(int64_t)(~pivot_is_zero));
+    }
+
+    unsigned char temp[(O_MAX * K_MAX + 1 + 15)];
+
+    // unbitslice the matrix A
+    for (int i = 0; i < nrows; i++) {
+        ef_unpack_m_vec(row_len, packed_A + i * row_len, temp);
+        for (int j = 0; j < ncols; j++) {
+            A[i * ncols + j] = temp[j];
+        }
+    }
+
+    // wipe every scratch buffer that held (parts of) the secret matrix
+    mayo_secure_clear(temp, K_MAX * O_MAX + 1 + 15);
+    mayo_secure_clear(_pivot_row, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(_pivot_row2, (K_MAX * O_MAX + 1 + 15) / 16 * 8);
+    mayo_secure_clear(packed_A, ((K_MAX * O_MAX + 1 + 15) / 16) * M_MAX * 8);
+}
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c
new file mode 100644
index 0000000000..ce1ccd4c71
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.c
@@ -0,0 +1,656 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mem.h>
+#include <mayo.h>
+#include <randombytes.h>
+#include <aes_ctr.h>
+#include <arithmetic.h>
+#include <simple_arithmetic.h>
+#include <fips202.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdalign.h>
+#ifdef ENABLE_CT_TESTING
+#include <valgrind/memcheck.h>
+#endif
+
+#define MAYO_MIN(x, y) (((x) < (y)) ? (x) : (y))
+#define PK_PRF AES_128_CTR
+
+// Unpacks mdeclen 4-bit values from m into mdec, one value per output byte.
+// Each input byte yields its low nibble first, then its high nibble; when
+// mdeclen is odd only the low nibble of the final input byte is used.
+static void decode(const unsigned char *m, unsigned char *mdec, int mdeclen) {
+    const int full_bytes = mdeclen / 2;
+
+    for (int b = 0; b < full_bytes; ++b) {
+        mdec[2 * b]     = m[b] & 0xf;
+        mdec[2 * b + 1] = m[b] >> 4;
+    }
+
+    if (mdeclen % 2 == 1) {
+        mdec[2 * full_bytes] = m[full_bytes] & 0x0f;
+    }
+}
+
+// Packs mlen values from m (one per byte) into menc, two per output byte:
+// the even-indexed value goes to the low nibble, the odd-indexed one is
+// shifted into the high nibble. An odd trailing value is stored unshifted.
+static void encode(const unsigned char *m, unsigned char *menc, int mlen) {
+    const int full_bytes = mlen / 2;
+
+    for (int b = 0; b < full_bytes; ++b) {
+        menc[b] = m[2 * b] | (m[2 * b + 1] << 4);
+    }
+
+    if (mlen % 2 == 1) {
+        menc[full_bytes] = m[2 * full_bytes];
+    }
+}
+
+// Folds the k x k block _vPv (each entry a nibble-packed vector of m/16 64-bit
+// words) into a single vector modulo f(X) and combines it with t:
+//     y[i] = t[i] ^ (nibble i of the folded vector),  0 <= i < m.
+// The pairs (i, j), j >= i, are visited with i descending and j ascending; at
+// every step the accumulator is multiplied by X (shifted up one nibble),
+// reduced with the F_TAIL_LEN tail coefficients of f, and then vPv[i][j]
+// (plus vPv[j][i] when i != j) is XORed in.
+// y may alias t (each y[i] is written after t[i] has been read).
+static void compute_rhs(const mayo_params_t *p, const uint64_t *_vPv, const unsigned char *t, unsigned char *y){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+
+    const uint64_t *vPv = _vPv;
+    uint64_t temp[M_MAX/16] = {0};
+    unsigned char *temp_bytes = (unsigned char *) temp;
+    // k is a word cursor into temp. It is 0 on the very first pass (temp is
+    // still all zero then) and m/16 - 1, the top word, on every later pass.
+    int k = 0;
+    for (int i = PARAM_k(p) - 1; i >= 0 ; i--) {
+        for (int j = i; j < PARAM_k(p); j++) {
+            // multiply by X (shift up 4 bits)
+            // `top` is the nibble shifted out of the highest word
+            unsigned char top = temp[k] >> 60;
+            temp[k] <<= 4;
+            k--;
+            for(; k>=0; k--){
+                temp[k+1] ^= temp[k] >> 60;
+                temp[k] <<= 4;
+            }
+            // reduce mod f(X)
+            // tail coefficient jj is multiplied by `top` and XORed into nibble jj
+            // (even jj: low nibble of byte jj/2, odd jj: high nibble)
+            for (int jj = 0; jj < F_TAIL_LEN; jj++) {
+                if(jj%2 == 0){
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]);
+#endif
+                }
+                else {
+#ifdef TARGET_BIG_ENDIAN
+                    temp_bytes[(((jj/2 + 8) / 8) * 8) - 1 - (jj/2)%8] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#else
+                    temp_bytes[jj/2] ^= mul_f(top, PARAM_f_tail(p)[jj]) << 4;
+#endif
+                }
+            }
+
+            // extract from vPv and add
+            // (i!=j) is 0 or 1, so the symmetric entry is only added off the diagonal
+            for(k=0; k < PARAM_m(p)/16; k ++){
+                temp[k] ^= vPv[( i * PARAM_k(p) + j )* (PARAM_m(p) / 16) + k] ^ ((i!=j)*vPv[( j * PARAM_k(p) + i )* (PARAM_m(p) / 16) + k]);
+            }
+            // leave k on the top word for the next shift
+            k--;
+        }
+    }
+
+    // add to y
+    // two nibbles per byte of temp: low nibble -> y[i], high nibble -> y[i+1]
+    for (int i = 0; i < PARAM_m(p); i+=2)
+    {
+#ifdef TARGET_BIG_ENDIAN
+        y[i]   = t[i]   ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[(((i/2 + 8) / 8) * 8) - 1 - (i/2)%8] >> 4);
+#else
+        y[i]   = t[i]   ^ (temp_bytes[i/2] & 0xF);
+        y[i+1] = t[i+1] ^ (temp_bytes[i/2] >> 4);
+#endif
+
+    }
+}
+
+// In-place transpose of a 16x16 matrix of 4-bit entries stored as 16 words
+// (row r = M[r], column c = nibble c counted from the least significant end).
+// Classic recursive block swap: stage s exchanges the off-diagonal blocks of
+// size 2^s, i.e. row `lo` trades its upper half-groups with the lower
+// half-groups of row `lo + 2^s`.
+static void transpose_16x16_nibbles(uint64_t *M){
+    // stage_mask[s] keeps the lower 4*2^s bits of every 8*2^s-bit group
+    static const uint64_t stage_mask[4] = {
+        0x0f0f0f0f0f0f0f0f,
+        0x00ff00ff00ff00ff,
+        0x0000ffff0000ffff,
+        0x00000000ffffffff
+    };
+
+    for (int stage = 0; stage < 4; stage++) {
+        const size_t dist  = (size_t) 1 << stage;  // distance between partner rows
+        const int    shift = 4 << stage;           // width of the swapped group in bits
+        const uint64_t mask = stage_mask[stage];
+
+        for (size_t lo = 0; lo < 16; lo++) {
+            if (lo & dist) {
+                continue;  // this row is the upper partner of an earlier `lo`
+            }
+            const size_t hi = lo + dist;
+            const uint64_t t = ((M[lo] >> shift) ^ M[hi]) & mask;
+            M[lo] ^= t << shift;
+            M[hi] ^= t;
+        }
+    }
+}
+
+#define MAYO_M_OVER_8 ((M_MAX + 7) / 8)
+
+// Builds the byte matrix A_out (m rows, row stride PARAM_A_cols, one field
+// element per byte) from the k blocks of _VtL (each block o vectors of m/16
+// nibble-packed words).
+//
+// Steps:
+//   1. For every pair (i, j) with j >= i (i ascending, j descending) XOR block
+//      M_j into the column range of index i and, when i != j, block M_i into
+//      the column range of index j, each shifted "down" by one more nibble
+//      than the previous pair.
+//   2. Transpose every 16x16 nibble tile so that words hold row data.
+//   3. Fold the rows at positions m .. m + k(k+1)/2 - 1 back into the first
+//      rows using the tail coefficients of f (reduction mod f).
+//   4. Decode the first m rows into A_out. Only columns 0 .. A_cols-2 are
+//      written; the last column of A_out is left untouched by this function.
+static void compute_A(const mayo_params_t *p, const uint64_t *_VtL, unsigned char *A_out){
+    #ifndef ENABLE_PARAMS_DYNAMIC
+    (void) p;
+    #endif
+    
+    const uint64_t *VtL = _VtL;
+    // current shift, split into whole 64-bit words and remaining bits
+    int bits_to_shift = 0;
+    int words_to_shift = 0;
+    uint64_t A[(((O_MAX*K_MAX+15)/16)*16)*MAYO_M_OVER_8] = {0};
+    // number of columns rounded up to a multiple of 16 (one transpose tile)
+    size_t A_width = ((PARAM_o(p)*PARAM_k(p) + 15)/16)*16;
+    const uint64_t *Mi, *Mj;
+
+    for (int i = 0; i <= PARAM_k(p) - 1; ++i) {
+        for (int j = PARAM_k(p) - 1; j >= i; --j) {
+            // add the M_i and M_j to A, shifted "down" by l positions
+            Mj = VtL + j * (PARAM_m(p)/16) * PARAM_o(p);
+            for (int c = 0; c < PARAM_o(p); c++) {
+                for (int k = 0; k < PARAM_m(p)/16; k++)
+                {
+                    A[ PARAM_o(p) * i + c + (k + words_to_shift)*A_width] ^= Mj[k + c*PARAM_m(p)/16] << bits_to_shift;
+                    // bits pushed out of this word spill into the next one
+                    if(bits_to_shift > 0){
+                        A[ PARAM_o(p) * i + c + (k + words_to_shift + 1)*A_width] ^= Mj[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                    }
+                }
+            }
+
+            if (i != j) {
+                Mi = VtL + i * (PARAM_m(p)/16) * PARAM_o(p);
+                for (int c = 0; c < PARAM_o(p); c++) {
+                    for (int k = 0; k < PARAM_m(p)/16; k++)
+                    {
+                        A[PARAM_o(p) * j + c + (k + words_to_shift)*A_width] ^= Mi[k + c*PARAM_m(p)/16] << bits_to_shift;
+                        if(bits_to_shift > 0){
+                            A[PARAM_o(p) * j + c + (k + words_to_shift + 1)*A_width] ^= Mi[k + c*PARAM_m(p)/16] >> (64-bits_to_shift);
+                        }
+                    }
+                }
+            }
+
+            // next pair is shifted by one more nibble
+            bits_to_shift += 4;
+            if(bits_to_shift == 64){
+                words_to_shift ++;
+                bits_to_shift = 0;
+            }
+        }
+    }
+
+    // transpose every 16-word tile that can hold data (m + k(k+1)/2 rows)
+    for (size_t c = 0; c < A_width*((PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 +15)/16) ; c+= 16)
+    {
+        transpose_16x16_nibbles(A + c);
+    }
+
+    // tab[4*i + b] = f_tail[i] * (1 << b): products with the single-bit field
+    // elements 1, 2, 4, 8, used to multiply whole words of nibbles at once
+    unsigned char tab[F_TAIL_LEN*4] = {0};
+    for (size_t i = 0; i < F_TAIL_LEN; i++)
+    {
+        tab[4*i]   = mul_f(PARAM_f_tail(p)[i],1);
+        tab[4*i+1] = mul_f(PARAM_f_tail(p)[i],2);
+        tab[4*i+2] = mul_f(PARAM_f_tail(p)[i],4);
+        tab[4*i+3] = mul_f(PARAM_f_tail(p)[i],8);
+    }
+
+    uint64_t low_bit_in_nibble = 0x1111111111111111;
+    
+    // reduce the overflow rows (index >= m) into rows r - m + t
+    for (size_t c = 0; c < A_width; c+= 16)
+    {
+        for (int r = PARAM_m(p); r < PARAM_m(p) + (PARAM_k(p)+1)*PARAM_k(p)/2 ; r++)
+        {
+            size_t pos = (r/16)*A_width + c + (r%16);
+            // t0..t3: bit b of every nibble of the row, moved to the nibble's low bit
+            uint64_t t0 =  A[pos]       & low_bit_in_nibble;
+            uint64_t t1 = (A[pos] >> 1) & low_bit_in_nibble;
+            uint64_t t2 = (A[pos] >> 2) & low_bit_in_nibble;
+            uint64_t t3 = (A[pos] >> 3) & low_bit_in_nibble;
+            for (size_t t = 0; t < F_TAIL_LEN; t++)
+            {
+                A[((r+t-PARAM_m(p))/16)*A_width + c + ((r+t)%16)] ^= t0*tab[4*t+0] ^ t1*tab[4*t+1] ^ t2*tab[4*t+2] ^ t3*tab[4*t+3];
+            }
+        }
+    }
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < (((PARAM_o(p)*PARAM_k(p)+15)/16)*16)*MAYO_M_OVER_8; ++i) 
+        A[i] = BSWAP64(A[i]);
+#endif
+
+    // unpack the first m rows into A_out, 16 columns per word, skipping the
+    // last column of A_out
+    for (int r = 0; r < PARAM_m(p); r+=16)
+    {
+        for (int c = 0; c < PARAM_A_cols(p)-1 ; c+=16)
+        {
+            for (size_t i = 0; i < 16; i++)
+            {
+                decode( (unsigned char *) &A[r*A_width/16 + c + i], A_out + PARAM_A_cols(p)*(r+i) + c, MAYO_MIN(16, PARAM_A_cols(p)-1-c));
+            }   
+        }
+    }
+}
+
+// Public API
+
+// Generates a key pair by delegating to mayo_keypair_compact and returning
+// its status code unchanged (MAYO_OK on success).
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk) {
+    return mayo_keypair_compact(p, pk, sk);
+}
+
+// Produces a detached signature of m (mlen bytes) under the compact secret
+// key csk.
+//
+//   sig    : receives PARAM_sig_bytes(p) bytes (encoded s followed by the salt)
+//   siglen : set to PARAM_sig_bytes(p) on success, untouched on failure
+//
+// Returns MAYO_OK on success. Returns MAYO_ERR if secret-key expansion or the
+// random number generator fails, or if none of the 256 attempts (counter byte
+// 0..255) yields a solvable linear system; in that case no signature is
+// written. All secret-dependent scratch buffers are wiped before returning.
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    int solved = 0;                            // becomes 1 once sample_solution succeeds
+    unsigned char tenc[M_BYTES_MAX], t[M_MAX]; // no secret data
+    unsigned char y[M_MAX];                    // secret data
+    unsigned char salt[SALT_BYTES_MAX];        // not secret data
+    unsigned char V[K_MAX * V_BYTES_MAX + R_BYTES_MAX],
+             Vdec[N_MINUS_O_MAX * K_MAX];                 // secret data
+    unsigned char A[M_MAX * (K_MAX * O_MAX + 1)];   // secret data
+    unsigned char x[K_MAX * N_MAX];                       // not secret data
+    unsigned char r[K_MAX * O_MAX + 1] = { 0 };           // secret data
+    unsigned char s[K_MAX * N_MAX];                       // not secret data
+    const unsigned char *seed_sk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX]; // secret data
+    alignas(32) sk_t sk;                    // secret data
+    unsigned char Ox[N_MINUS_O_MAX];        // secret data
+    // unsigned char Mdigest[DIGEST_BYTES];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1];
+    unsigned char *ctrbyte;
+    unsigned char *vi;
+    // V * L and V * P1 * V^T; both depend on the secret vinegar values.
+    // Declared up front so the cleanup at `err` can always wipe them.
+    alignas (32) uint64_t Mtmp[K_MAX * O_MAX * M_MAX / 16] = {0}; // secret data
+    alignas (32) uint64_t Y[K_MAX * K_MAX * M_MAX / 16] = {0};    // secret data
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_v_bytes = PARAM_v_bytes(p);
+    const int param_r_bytes = PARAM_r_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_A_cols = PARAM_A_cols(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    ret = mayo_expand_sk(p, csk, &sk);
+    if (ret != MAYO_OK) {
+        goto err;
+    }
+
+    seed_sk = csk;
+    decode(sk.o, O, (param_n - param_o) * param_o);
+
+    // hash message
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    uint64_t *P1 = sk.p;
+    uint64_t *L  = P1 + (param_P1_bytes/8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        L[i] = BSWAP64(L[i]);
+    }
+#endif
+
+    // choose the randomizer
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(tmp + param_digest_bytes, param_salt_bytes);
+    #else
+    if (randombytes(tmp + param_digest_bytes, param_salt_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // hashing to salt: salt = shake256(digest || randomizer || seed_sk)
+    memcpy(tmp + param_digest_bytes + param_salt_bytes, seed_sk,
+           param_sk_seed_bytes);
+    shake256(salt, param_salt_bytes, tmp,
+             param_digest_bytes + param_salt_bytes + param_sk_seed_bytes);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(salt, SALT_BYTES_MAX); // Salt is not secret
+#endif
+
+    // hashing to t: t = decode(shake256(digest || salt))
+    memcpy(tmp + param_digest_bytes, salt, param_salt_bytes);
+    ctrbyte = tmp + param_digest_bytes + param_salt_bytes + param_sk_seed_bytes;
+
+    shake256(tenc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+
+    decode(tenc, t, param_m); // may not be necessary
+
+    for (int ctr = 0; ctr <= 255; ++ctr) {
+        *ctrbyte = (unsigned char)ctr;
+
+        // derive the vinegar vectors and r from digest || salt || seed_sk || ctr
+        shake256(V, param_k * param_v_bytes + param_r_bytes, tmp,
+                 param_digest_bytes + param_salt_bytes + param_sk_seed_bytes + 1);
+
+        // decode the v_i vectors
+        for (int i = 0; i <= param_k - 1; ++i) {
+            decode(V + i * param_v_bytes, Vdec + i * (param_n - param_o),
+                   param_n - param_o);
+        }
+
+        // compute all the V * L matrices.
+        // compute all the V * P1 * V^T matrices.
+        // Y starts from zero on every attempt.
+        memset(Y, 0, sizeof(Y));
+        V_times_L__V_times_P1_times_Vt(p, L, Vdec, Mtmp, P1, Y);
+
+        compute_rhs(p, Y, t, y);
+        compute_A(p, Mtmp, A);
+
+        decode(V + param_k * param_v_bytes, r,
+               param_k *
+               param_o);
+        if (sample_solution(p, A, y, r, x, param_k, param_o, param_m, param_A_cols)) {
+            solved = 1;
+            break;
+        } else {
+            memset(Mtmp, 0, param_k * param_o * param_m / 16 * sizeof(uint64_t));
+        }
+    }
+
+    // Every attempt failed: x does not hold a solution, so refuse to emit a
+    // signature instead of encoding whatever x happens to contain.
+    if (!solved) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+
+    // s_i = (v_i + O * x_i) || x_i; every byte of s[0 .. n*k) is written below
+    // TODO: optimize this?
+    for (int i = 0; i <= param_k - 1; ++i) {
+        vi = Vdec + i * (param_n - param_o);
+        mat_mul(O, x + i * param_o, Ox, param_o, param_n - param_o, 1);
+        mat_add(vi, Ox, s + i * param_n, param_n - param_o, 1);
+        memcpy(s + i * param_n + (param_n - param_o), x + i * param_o, param_o);
+    }
+    encode(s, sig, param_n * param_k);
+    memcpy(sig + param_sig_bytes - param_salt_bytes, salt, param_salt_bytes);
+    *siglen = param_sig_bytes;
+err:
+    mayo_secure_clear(V, K_MAX * V_BYTES_MAX + R_BYTES_MAX);
+    mayo_secure_clear(Vdec, N_MINUS_O_MAX * K_MAX);
+    mayo_secure_clear(A, M_MAX * (K_MAX * O_MAX + 1));
+    mayo_secure_clear(r, K_MAX * O_MAX + 1);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(&sk, sizeof(sk_t));
+    mayo_secure_clear(Ox, N_MINUS_O_MAX);
+    mayo_secure_clear(tmp,
+                      DIGEST_BYTES_MAX + SALT_BYTES_MAX + SK_SEED_BYTES_MAX + 1);
+    // y is declared as secret above; Mtmp and Y are derived from the vinegar values
+    mayo_secure_clear(y, sizeof(y));
+    mayo_secure_clear(Mtmp, sizeof(Mtmp));
+    mayo_secure_clear(Y, sizeof(Y));
+    return ret;
+}
+
+// Produces a signed message sm = signature || m.
+//
+//   sm    : output buffer of at least PARAM_sig_bytes(p) + mlen bytes; it may
+//           overlap m (including sm == m)
+//   smlen : set to PARAM_sig_bytes(p) + mlen on success, untouched on failure
+//
+// Returns MAYO_OK on success and a non-OK code otherwise; on failure the
+// output buffer is zeroed.
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk) {
+    int ret = MAYO_OK;
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    size_t siglen = param_sig_bytes;
+
+    // Move the message into its final position BEFORE the signature is
+    // written: when m overlaps sm, writing the signature first would clobber
+    // the start of the message that is copied afterwards. From here on the
+    // message is read from its new location only.
+    memmove(sm + param_sig_bytes, m, mlen);
+
+    ret = mayo_sign_signature(p, sm, &siglen, sm + param_sig_bytes, mlen, csk);
+    if (ret == MAYO_OK && siglen != (size_t) param_sig_bytes) {
+        // an unexpected signature length is a failure, not a silent success
+        ret = MAYO_ERR;
+    }
+    if (ret != MAYO_OK) {
+        memset(sm, 0, (size_t) param_sig_bytes + mlen);
+        goto err;
+    }
+
+    *smlen = siglen + mlen;
+err:
+    return ret;
+}
+
+// Verifies a signed message sm = signature || message and, when the
+// signature is valid, copies the message to m and stores its length in *mlen.
+// Returns MAYO_ERR if sm is shorter than a signature, otherwise the result of
+// mayo_verify. m and *mlen are only written on MAYO_OK.
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk) {
+    const size_t sig_len = (size_t) PARAM_sig_bytes(p);
+
+    if (smlen < sig_len) {
+        return MAYO_ERR;
+    }
+
+    const unsigned char *msg = sm + sig_len;
+    const size_t msg_len = smlen - sig_len;
+    const int status = mayo_verify(p, msg, msg_len, sm, pk);
+
+    if (status != MAYO_OK) {
+        return status;
+    }
+
+    *mlen = msg_len;
+    memmove(m, msg, msg_len);  // memmove: m may overlap sm
+    return status;
+}
+
+// Generates a compact key pair.
+//
+//   csk : receives the secret seed (PARAM_sk_seed_bytes(p) random bytes)
+//   cpk : receives seed_pk followed by the upper-triangular part of P3
+//
+// Returns MAYO_OK on success, MAYO_ERR if the random number generator fails.
+// Every buffer that holds secret-derived data (S, O, P, P3) is wiped before
+// returning, on both the success and the error path.
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk) {
+    int ret = MAYO_OK;
+    unsigned char *seed_sk = csk;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    alignas (32) uint64_t P[(P1_BYTES_MAX + P2_BYTES_MAX) / 8];
+    alignas (32) uint64_t P3[O_MAX * O_MAX * M_MAX / 16] = {0};
+
+    unsigned char *seed_pk;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    // seed_sk $←- B^(sk_seed bytes)
+    #if defined(PQM4) || defined(HAVE_RANDOMBYTES_NORETVAL)
+    randombytes(seed_sk, param_sk_seed_bytes);
+    #else
+    if (randombytes(seed_sk, param_sk_seed_bytes) != MAYO_OK) {
+        ret = MAYO_ERR;
+        goto err;
+    }
+    #endif
+
+    // S ← shake256(seedsk, pk seed bytes + O bytes)
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    // seed_pk ← s[0 : pk_seed_bytes]
+    seed_pk = S;
+
+    // o ← Decode_o(s[pk_seed_bytes : pk_seed_bytes + o_bytes])
+    decode(S + param_pk_seed_bytes, O, param_v * param_o);
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1, P2, and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+
+    int m_legs = param_m / 32;
+
+    uint64_t *P1 = P;
+    // second part of P: starts out as P2 and is handed to Ot_times_P1O_P2
+    // as its P1*O + P2 operand
+    uint64_t *P1O_P2 = P + (param_P1_bytes / 8);
+
+    // compute P3 = O^t * (P1*O + P2)
+    Ot_times_P1O_P2(p, P1, O, P1O_P2, P3);
+
+    // store seed_pk in cpk
+    memcpy(cpk, seed_pk, param_pk_seed_bytes);
+
+    alignas (32) uint64_t P3_upper[P3_BYTES_MAX / 8];
+
+    // compute Upper(P3) and store in cpk
+    m_upper(m_legs, P3, P3_upper, param_o);
+    
+    memcpy(cpk + param_pk_seed_bytes, P3_upper, param_P3_bytes);
+
+
+#if !defined(PQM4) && !defined(HAVE_RANDOMBYTES_NORETVAL)
+err:
+#endif
+    // S carries the encoded secret O (as in mayo_expand_sk, which wipes it too);
+    // P holds the P1*O + P2 intermediate, which depends on O.
+    mayo_secure_clear(S, sizeof(S));
+    mayo_secure_clear(P, sizeof(P));
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    mayo_secure_clear(P3, O_MAX * O_MAX * M_MAX / 2);
+    return ret;
+}
+
+// Expands a compact public key: the first P1_bytes + P2_bytes of pk are
+// generated by PK_PRF from the seed at the start of cpk, and the P3 bytes
+// stored after the seed in cpk are appended unchanged. Always returns MAYO_OK.
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *pk) {
+#ifdef MAYO_VARIANT
+    (void)p;
+#endif
+    const int seed_len = PARAM_pk_seed_bytes(p);
+    const int p1p2_len = PARAM_P1_bytes(p) + PARAM_P2_bytes(p);
+    const int p3_len   = PARAM_P3_bytes(p);
+
+    PK_PRF(pk, p1p2_len, cpk, seed_len);
+    memmove(pk + p1p2_len, cpk + seed_len, p3_len);
+    return MAYO_OK;
+}
+
+// Expands the compact secret key csk (the secret seed) into *sk:
+//   sk->p : P1 followed by L, where L overwrites the P2 region with the
+//           result of P1P1t_times_O
+//   sk->o : the encoded oil matrix O (O_bytes bytes of the SHAKE256 output)
+// The local copies of the hash output and of the decoded O are wiped before
+// returning. As written, the function always returns MAYO_OK.
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *sk) {
+    int ret = MAYO_OK;
+    unsigned char S[PK_SEED_BYTES_MAX + O_BYTES_MAX];
+    uint64_t *P = sk->p;
+    unsigned char O[(N_MINUS_O_MAX)*O_MAX];
+
+    const int param_o = PARAM_o(p);
+    const int param_v = PARAM_v(p);
+    const int param_O_bytes = PARAM_O_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+    const int param_pk_seed_bytes = PARAM_pk_seed_bytes(p);
+    const int param_sk_seed_bytes = PARAM_sk_seed_bytes(p);
+
+    const unsigned char *seed_sk = csk;
+    // seed_pk is the first pk_seed_bytes of S
+    unsigned char *seed_pk = S;
+
+    // S = shake256(seed_sk) = seed_pk || encoded O
+    shake256(S, param_pk_seed_bytes + param_O_bytes, seed_sk,
+             param_sk_seed_bytes);
+    decode(S + param_pk_seed_bytes, O,
+           param_v * param_o); // O = S + PK_SEED_BYTES;
+
+#ifdef ENABLE_CT_TESTING
+    VALGRIND_MAKE_MEM_DEFINED(seed_pk, param_pk_seed_bytes);
+#endif
+
+    // encode decode not necessary, since P1,P2 and P3 are sampled and stored in correct format
+    PK_PRF((unsigned char *)P, param_P1_bytes + param_P2_bytes, seed_pk,
+           param_pk_seed_bytes);
+
+    uint64_t *P2 = P + (param_P1_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap to host word order for the arithmetic below
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+    
+    uint64_t *P1 = P;
+    // compute L_i = (P1 + P1^t)*O + P2
+    // L aliases P2, so the P2 region of sk->p is replaced by L
+    uint64_t *L = P2;
+    P1P1t_times_O(p, P1, O, L);
+
+    // write to sk
+    memcpy(sk->o, S + param_pk_seed_bytes, param_O_bytes);
+
+#ifdef TARGET_BIG_ENDIAN
+    // swap back so sk->p is stored in the same byte order as the PRF output
+    for (int i = 0; i < (param_P1_bytes + param_P2_bytes) / 8; ++i) {
+        P[i] = BSWAP64(P[i]);
+    }
+#endif
+
+    mayo_secure_clear(S, PK_SEED_BYTES_MAX + O_BYTES_MAX);
+    mayo_secure_clear(O, (N_MINUS_O_MAX)*O_MAX);
+    return ret;
+}
+
+// Verifies the detached signature sig of message m (mlen bytes) against the
+// compact public key cpk.
+//
+// The target t is derived from shake256(shake256(m) || salt), where the salt
+// is taken from the last salt_bytes of sig; the decoded signature s is pushed
+// through m_calculate_PS_SPS and compute_rhs, and the resulting m values are
+// compared with t.
+//
+// Returns MAYO_OK for a valid signature, MAYO_ERR otherwise (including when
+// the public key cannot be expanded).
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *cpk) {
+    unsigned char tEnc[M_BYTES_MAX];
+    unsigned char t[M_MAX];
+    unsigned char y[2 * M_MAX] = {0}; // extra space for reduction mod f(X)
+    unsigned char s[K_MAX * N_MAX];
+    alignas (64) uint64_t pk[EPK_BYTES_MAX / 8];
+    unsigned char tmp[DIGEST_BYTES_MAX + SALT_BYTES_MAX];
+
+    const int param_m = PARAM_m(p);
+    const int param_n = PARAM_n(p);
+    const int param_v = PARAM_v(p);
+    const int param_o = PARAM_o(p);
+    const int param_k = PARAM_k(p);
+    const int param_m_bytes = PARAM_m_bytes(p);
+    const int param_P1_bytes = PARAM_P1_bytes(p);
+    const int param_P2_bytes = PARAM_P2_bytes(p);
+#ifdef TARGET_BIG_ENDIAN
+    const int param_P3_bytes = PARAM_P3_bytes(p);
+#endif
+    const int param_sig_bytes = PARAM_sig_bytes(p);
+    const int param_digest_bytes = PARAM_digest_bytes(p);
+    const int param_salt_bytes = PARAM_salt_bytes(p);
+
+    int ret = mayo_expand_pk(p, cpk, (unsigned char *)pk);
+    if (ret != MAYO_OK) {
+        return MAYO_ERR;
+    }
+
+    // the expanded key is laid out as P1 || P2 || P3
+    uint64_t *P1 = pk;
+    uint64_t *P2 = pk + (param_P1_bytes / 8);
+    uint64_t *P3 = P2 + (param_P2_bytes / 8);
+
+#ifdef TARGET_BIG_ENDIAN
+    for (int i = 0; i < param_P1_bytes / 8; ++i) {
+        P1[i] = BSWAP64(P1[i]);
+    }
+    for (int i = 0; i < param_P2_bytes / 8; ++i) {
+        P2[i] = BSWAP64(P2[i]);
+    }
+    for (int i = 0; i < param_P3_bytes / 8; ++i) {
+        P3[i] = BSWAP64(P3[i]);
+    }
+#endif
+
+    // hash m
+    shake256(tmp, param_digest_bytes, m, mlen);
+
+    // compute t
+    // tmp = digest || salt, with the salt read from the tail of the signature
+    memcpy(tmp + param_digest_bytes, sig + param_sig_bytes - param_salt_bytes,
+           param_salt_bytes);
+    shake256(tEnc, param_m_bytes, tmp, param_digest_bytes + param_salt_bytes);
+    decode(tEnc, t, param_m);
+
+    // decode s
+    decode(sig, s, param_k * param_n);
+
+    // Compute S*P*S^T
+    alignas (32) uint64_t SPS[K_MAX * K_MAX * M_MAX / 16] = {0};
+
+    m_calculate_PS_SPS(P1, P2, P3, s, param_m,
+                             param_v, param_o, param_k, SPS);
+
+    // combine the vectors in SPS and reduce mod f(X)
+    // y is all zero on entry and is passed as both input and output, so it
+    // ends up holding the reduced value itself
+    compute_rhs(p, SPS, y, y);
+    
+    // only public values are compared here, so a plain memcmp is used
+    if (memcmp(y, t, param_m) == 0) {
+        return MAYO_OK; // good signature
+    }
+    return MAYO_ERR; // bad signature
+}
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h
new file mode 100644
index 0000000000..1de4bf2a33
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/mayo.h
@@ -0,0 +1,435 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MAYO_H
+#define MAYO_H
+
+#include <stdint.h>
+#include <stdlib.h>
+
+#define F_TAIL_LEN 5
+#define F_TAIL_64                                                              \
+  { 8, 0, 2, 8, 0 } // f(z) =  z^64         + x^3*z^3 + x*z^2         + x^3
+#define F_TAIL_96                                                              \
+  { 2, 2, 0, 2, 0 } // f(z) =  z^96         +   x*z^3         + x*z   + x
+#define F_TAIL_128                                                             \
+  { 4, 8, 0, 4, 2 } // f(z) = z^128 + x*z^4 + x^2*z^3         + x^3*z + x^2
+
+#define MAYO_1_name "MAYO_1"
+#define MAYO_1_n 66
+#define MAYO_1_m 64
+#define MAYO_1_o 8
+#define MAYO_1_v (MAYO_1_n - MAYO_1_o)
+#define MAYO_1_A_cols (MAYO_1_k * MAYO_1_o + 1)
+#define MAYO_1_k 9
+#define MAYO_1_q 16
+#define MAYO_1_m_bytes 32
+#define MAYO_1_O_bytes 232
+#define MAYO_1_v_bytes 29
+#define MAYO_1_r_bytes 36
+#define MAYO_1_P1_bytes 54752
+#define MAYO_1_P2_bytes 14848
+#define MAYO_1_P3_bytes 1152
+#define MAYO_1_csk_bytes 24
+#define MAYO_1_esk_bytes 69856
+#define MAYO_1_cpk_bytes 1168
+#define MAYO_1_epk_bytes 70752
+#define MAYO_1_sig_bytes 321
+#define MAYO_1_f_tail F_TAIL_64
+#define MAYO_1_f_tail_arr f_tail_64
+#define MAYO_1_salt_bytes 24
+#define MAYO_1_digest_bytes 32
+#define MAYO_1_pk_seed_bytes 16
+#define MAYO_1_sk_seed_bytes 24
+
+#define MAYO_2_name "MAYO_2"
+#define MAYO_2_n 78
+#define MAYO_2_m 64
+#define MAYO_2_o 18
+#define MAYO_2_v (MAYO_2_n - MAYO_2_o)
+#define MAYO_2_A_cols (MAYO_2_k * MAYO_2_o + 1)
+#define MAYO_2_k 4
+#define MAYO_2_q 16
+#define MAYO_2_m_bytes 32
+#define MAYO_2_O_bytes 540
+#define MAYO_2_v_bytes 30
+#define MAYO_2_r_bytes 36
+#define MAYO_2_P1_bytes 58560
+#define MAYO_2_P2_bytes 34560
+#define MAYO_2_P3_bytes 5472
+#define MAYO_2_csk_bytes 24
+#define MAYO_2_esk_bytes 93684
+#define MAYO_2_cpk_bytes 5488
+#define MAYO_2_epk_bytes 98592
+#define MAYO_2_sig_bytes 180
+#define MAYO_2_f_tail F_TAIL_64
+#define MAYO_2_f_tail_arr f_tail_64
+#define MAYO_2_salt_bytes 24
+#define MAYO_2_digest_bytes 32
+#define MAYO_2_pk_seed_bytes 16
+#define MAYO_2_sk_seed_bytes 24
+
+#define MAYO_3_name "MAYO_3"
+#define MAYO_3_n 99
+#define MAYO_3_m 96
+#define MAYO_3_o 10
+#define MAYO_3_v (MAYO_3_n - MAYO_3_o)
+#define MAYO_3_A_cols (MAYO_3_k * MAYO_3_o + 1)
+#define MAYO_3_k 11
+#define MAYO_3_q 16
+#define MAYO_3_m_bytes 48
+#define MAYO_3_O_bytes 445
+#define MAYO_3_v_bytes 45
+#define MAYO_3_r_bytes 55
+#define MAYO_3_P1_bytes 192240
+#define MAYO_3_P2_bytes 42720
+#define MAYO_3_P3_bytes 2640
+#define MAYO_3_csk_bytes 32
+#define MAYO_3_esk_bytes 235437
+#define MAYO_3_cpk_bytes 2656
+#define MAYO_3_epk_bytes 237600
+#define MAYO_3_sig_bytes 577
+#define MAYO_3_f_tail F_TAIL_96
+#define MAYO_3_f_tail_arr f_tail_96
+#define MAYO_3_salt_bytes 32
+#define MAYO_3_digest_bytes 48
+#define MAYO_3_pk_seed_bytes 16
+#define MAYO_3_sk_seed_bytes 32
+
+#define MAYO_5_name "MAYO_5"
+#define MAYO_5_n 133
+#define MAYO_5_m 128
+#define MAYO_5_o 12
+#define MAYO_5_v (MAYO_5_n - MAYO_5_o)
+#define MAYO_5_A_cols (MAYO_5_k * MAYO_5_o + 1)
+#define MAYO_5_k 12
+#define MAYO_5_q 16
+#define MAYO_5_m_bytes 64
+#define MAYO_5_O_bytes 726
+#define MAYO_5_v_bytes 61
+#define MAYO_5_r_bytes 72
+#define MAYO_5_P1_bytes 472384
+#define MAYO_5_P2_bytes 92928
+#define MAYO_5_P3_bytes 4992
+#define MAYO_5_csk_bytes 40
+#define MAYO_5_esk_bytes 566078
+#define MAYO_5_cpk_bytes 5008
+#define MAYO_5_epk_bytes 570304
+#define MAYO_5_sig_bytes 838
+#define MAYO_5_f_tail F_TAIL_128
+#define MAYO_5_f_tail_arr f_tail_128
+#define MAYO_5_salt_bytes 40
+#define MAYO_5_digest_bytes 64
+#define MAYO_5_pk_seed_bytes 16
+#define MAYO_5_sk_seed_bytes 40
+
+#define PARAM_JOIN2_(a, b) a##_##b
+#define PARAM_JOIN2(a, b) PARAM_JOIN2_(a, b)
+#define PARAM_NAME(end) PARAM_JOIN2(MAYO_VARIANT, end)
+
+#if defined(MAYO_VARIANT)
+#define PARAM_JOIN3_(a, b, c) pqmayo_##a##_##b##_##c
+#define PARAM_JOIN3(a, b, c) PARAM_JOIN3_(a, b, c)
+#define PARAM_NAME3(end, s) PARAM_JOIN3(MAYO_VARIANT, end, s)
+
+#if defined(MAYO_BUILD_TYPE_REF)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(ref, s)
+#elif defined(MAYO_BUILD_TYPE_OPT)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(opt, s)
+#elif defined(MAYO_BUILD_TYPE_AVX2)
+#define MAYO_NAMESPACE(s) PARAM_NAME3(avx2, s)
+#else
+#error "Build type not known"
+#endif
+
+#else
+#define MAYO_NAMESPACE(s) s
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define NAME_MAX mayo5
+#define N_MAX 133
+#define M_MAX 128
+#define O_MAX 18
+#define V_MAX 121
+#define K_MAX 12
+#define Q_MAX 16
+#define PK_SEED_BYTES_MAX 16
+#define SK_SEED_BYTES_MAX 40
+#define SALT_BYTES_MAX 40
+#define DIGEST_BYTES_MAX 64
+#define O_BYTES_MAX 726
+#define V_BYTES_MAX 61
+#define R_BYTES_MAX 72
+#define P1_BYTES_MAX 472384
+#define P2_BYTES_MAX 92928
+#define P3_BYTES_MAX 5472
+#define SIG_BYTES_MAX 838
+#define CPK_BYTES_MAX 5488
+#define CSK_BYTES_MAX 40
+#define ESK_BYTES_MAX 566078
+#define EPK_BYTES_MAX 570304
+#define N_MINUS_O_MAX 121
+#define M_BYTES_MAX 64
+#elif defined(MAYO_VARIANT)
+#define M_MAX PARAM_NAME(m)
+#define N_MAX PARAM_NAME(n)
+#define O_MAX PARAM_NAME(o)
+#define V_MAX PARAM_NAME(v)
+#define K_MAX PARAM_NAME(k)
+#define Q_MAX PARAM_NAME(q)
+#define M_BYTES_MAX PARAM_NAME(m_bytes)
+#define O_BYTES_MAX PARAM_NAME(O_bytes)
+#define V_BYTES_MAX PARAM_NAME(v_bytes)
+#define R_BYTES_MAX PARAM_NAME(r_bytes)
+#define P1_BYTES_MAX PARAM_NAME(P1_bytes)
+#define P2_BYTES_MAX PARAM_NAME(P2_bytes)
+#define P3_BYTES_MAX PARAM_NAME(P3_bytes)
+#define SIG_BYTES_MAX PARAM_NAME(sig_bytes)
+#define CSK_BYTES_MAX PARAM_NAME(csk_bytes)
+#define ESK_BYTES_MAX PARAM_NAME(esk_bytes)
+#define CPK_BYTES_MAX PARAM_NAME(cpk_bytes)
+#define EPK_BYTES_MAX PARAM_NAME(epk_bytes)
+#define N_MINUS_O_MAX (N_MAX - O_MAX)
+#define SALT_BYTES_MAX PARAM_NAME(salt_bytes)
+#define DIGEST_BYTES_MAX PARAM_NAME(digest_bytes)
+#define PK_SEED_BYTES_MAX PARAM_NAME(pk_seed_bytes)
+#define SK_SEED_BYTES_MAX PARAM_NAME(sk_seed_bytes) // taken from the variant's own sk_seed_bytes rather than aliased to its salt length
+#else
+#error "Parameter not specified"
+#endif
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+#define PARAM_name(p) (p->name)
+#define PARAM_m(p) (p->m)
+#define PARAM_n(p) (p->n)
+#define PARAM_o(p) (p->o)
+#define PARAM_v(p) (p->n - p->o)
+#define PARAM_A_cols(p) (p->k * p->o + 1)
+#define PARAM_k(p) (p->k)
+#define PARAM_q(p) (p->q)
+#define PARAM_m_bytes(p) (p->m_bytes)
+#define PARAM_O_bytes(p) (p->O_bytes)
+#define PARAM_v_bytes(p) (p->v_bytes)
+#define PARAM_r_bytes(p) (p->r_bytes)
+#define PARAM_P1_bytes(p) (p->P1_bytes)
+#define PARAM_P2_bytes(p) (p->P2_bytes)
+#define PARAM_P3_bytes(p) (p->P3_bytes)
+#define PARAM_csk_bytes(p) (p->csk_bytes)
+#define PARAM_esk_bytes(p) (p->esk_bytes)
+#define PARAM_cpk_bytes(p) (p->cpk_bytes)
+#define PARAM_epk_bytes(p) (p->epk_bytes)
+#define PARAM_sig_bytes(p) (p->sig_bytes)
+#define PARAM_f_tail(p) (p->f_tail)
+#define PARAM_salt_bytes(p) (p->salt_bytes)
+#define PARAM_sk_seed_bytes(p) (p->sk_seed_bytes)
+#define PARAM_digest_bytes(p) (p->digest_bytes)
+#define PARAM_pk_seed_bytes(p) (p->pk_seed_bytes)
+#elif defined(MAYO_VARIANT)
+#define PARAM_name(p) PARAM_NAME(name)
+#define PARAM_m(p) PARAM_NAME(m)
+#define PARAM_n(p) PARAM_NAME(n)
+#define PARAM_o(p) PARAM_NAME(o)
+#define PARAM_v(p) PARAM_NAME(v)
+#define PARAM_A_cols(p) PARAM_NAME(A_cols)
+#define PARAM_k(p) PARAM_NAME(k)
+#define PARAM_q(p) PARAM_NAME(q)
+#define PARAM_m_bytes(p) PARAM_NAME(m_bytes)
+#define PARAM_O_bytes(p) PARAM_NAME(O_bytes)
+#define PARAM_v_bytes(p) PARAM_NAME(v_bytes)
+#define PARAM_r_bytes(p) PARAM_NAME(r_bytes)
+#define PARAM_P1_bytes(p) PARAM_NAME(P1_bytes)
+#define PARAM_P2_bytes(p) PARAM_NAME(P2_bytes)
+#define PARAM_P3_bytes(p) PARAM_NAME(P3_bytes)
+#define PARAM_csk_bytes(p) PARAM_NAME(csk_bytes)
+#define PARAM_esk_bytes(p) PARAM_NAME(esk_bytes)
+#define PARAM_cpk_bytes(p) PARAM_NAME(cpk_bytes)
+#define PARAM_epk_bytes(p) PARAM_NAME(epk_bytes)
+#define PARAM_sig_bytes(p) PARAM_NAME(sig_bytes)
+static const unsigned char f_tail[] = PARAM_NAME(f_tail);
+#define PARAM_salt_bytes(p) PARAM_NAME(salt_bytes)
+#define PARAM_sk_seed_bytes(p) PARAM_NAME(sk_seed_bytes)
+#define PARAM_digest_bytes(p) PARAM_NAME(digest_bytes)
+#define PARAM_pk_seed_bytes(p) PARAM_NAME(pk_seed_bytes)
+#define PARAM_f_tail(p) f_tail
+#else
+#error "Parameter not specified"
+#endif
+
+/**
+ * Struct defining MAYO parameters (read through the PARAM_*(p) macros when ENABLE_PARAMS_DYNAMIC is defined)
+ */
+typedef struct {
+    int m;
+    int n;
+    int o;
+    int k;
+    int q;
+    const unsigned char *f_tail; // set by MAYO_GEN_PARAMS to the parameter set's <name>_f_tail_arr array
+    int m_bytes;
+    int O_bytes;
+    int v_bytes;
+    int r_bytes;
+    int R_bytes; // NOTE(review): not assigned by MAYO_GEN_PARAMS and has no PARAM_* accessor -- confirm it is still needed
+    int P1_bytes;
+    int P2_bytes;
+    int P3_bytes;
+    int csk_bytes;
+    int esk_bytes;
+    int cpk_bytes;
+    int epk_bytes;
+    int sig_bytes;
+    int salt_bytes;
+    int sk_seed_bytes;
+    int digest_bytes;
+    int pk_seed_bytes;
+    const char *name; // set by MAYO_GEN_PARAMS to the stringified parameter-set identifier
+} mayo_params_t;
+
+typedef struct sk_t {
+    uint64_t p[P1_BYTES_MAX/8 + P2_BYTES_MAX/8];
+    uint8_t o[O_BYTES_MAX];
+} sk_t;
+
+/**
+ * MAYO parameter sets
+ */
+#ifdef ENABLE_PARAMS_DYNAMIC
+extern const mayo_params_t MAYO_1;
+extern const mayo_params_t MAYO_2;
+extern const mayo_params_t MAYO_3;
+extern const mayo_params_t MAYO_5;
+#endif
+
+/**
+ * Status codes
+ */
+#define MAYO_OK 0
+#define MAYO_ERR 1
+
+/**
+ * Mayo keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] pk Mayo public key
+ * @param[out] sk Mayo secret key
+ * @return int status code
+ */
+#define mayo_keypair MAYO_NAMESPACE(mayo_keypair)
+int mayo_keypair(const mayo_params_t *p, unsigned char *pk, unsigned char *sk);
+
+#define mayo_sign_signature MAYO_NAMESPACE(mayo_sign_signature)
+int mayo_sign_signature(const mayo_params_t *p, unsigned char *sig,
+              size_t *siglen, const unsigned char *m,
+              size_t mlen, const unsigned char *csk);
+
+/**
+ * MAYO signature generation.
+ *
+ * The implementation performs Mayo.expandSK() + Mayo.sign() in the Mayo spec.
+ * The key provided is a compacted secret key.
+ * The caller is responsible for allocating sufficient memory to hold sm.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] sm Signature concatenated with message
+ * @param[out] smlen Pointer to the length of sm
+ * @param[in] m Message to be signed
+ * @param[in] mlen Message length
+ * @param[in] sk Compacted secret key
+ * @return int status code
+ */
+#define mayo_sign MAYO_NAMESPACE(mayo_sign)
+int mayo_sign(const mayo_params_t *p, unsigned char *sm,
+              size_t *smlen, const unsigned char *m,
+              size_t mlen, const unsigned char *sk);
+
+/**
+ * Mayo open signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, the original message is stored in m.
+ * The key provided is a compact public key.
+ * The caller is responsible for allocating sufficient memory to hold m.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] m Message stored if verification succeeds
+ * @param[out] mlen Pointer to the length of m
+ * @param[in] sm Signature concatenated with message
+ * @param[in] smlen Length of sm
+ * @param[in] pk Compacted public key
+ * @return int status code
+ */
+#define mayo_open MAYO_NAMESPACE(mayo_open)
+int mayo_open(const mayo_params_t *p, unsigned char *m,
+              size_t *mlen, const unsigned char *sm,
+              size_t smlen, const unsigned char *pk);
+
+/**
+ * Mayo compact keypair generation.
+ *
+ * The implementation corresponds to Mayo.CompactKeyGen() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold pk and sk.
+ * 
+ * outputs a pair (csk, cpk) \in B^{csk_bytes} x B^{cpk_bytes}, where csk and
+ * cpk are compact representations of a Mayo secret key and public key
+ *
+ * @param[in] p Mayo parameter set
+ * @param[out] cpk Mayo compacted public key
+ * @param[out] csk Mayo compacted secret key
+ * @return int status code
+ */
+#define mayo_keypair_compact MAYO_NAMESPACE(mayo_keypair_compact)
+int mayo_keypair_compact(const mayo_params_t *p, unsigned char *cpk,
+                         unsigned char *csk);
+
+/**
+ * Mayo expand public key.
+ *
+ * The implementation corresponds to Mayo.expandPK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold epk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] cpk Compacted public key.
+ * @param[out] epk Expanded public key.
+ * @return int return code
+ */
+#define mayo_expand_pk MAYO_NAMESPACE(mayo_expand_pk)
+int mayo_expand_pk(const mayo_params_t *p, const unsigned char *cpk,
+                   unsigned char *epk);
+
+/**
+ * Mayo expand secret key.
+ *
+ * The implementation corresponds to Mayo.expandSK() in the Mayo spec.
+ * The caller is responsible to allocate sufficient memory to hold esk.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] csk Compacted secret key.
+ * @param[out] esk Expanded secret key.
+ * @return int return code
+ */
+#define mayo_expand_sk MAYO_NAMESPACE(mayo_expand_sk)
+int mayo_expand_sk(const mayo_params_t *p, const unsigned char *csk,
+                   sk_t *esk);
+
+/**
+ * Mayo verify signature.
+ *
+ * The implementation performs Mayo.verify(). If the signature verification succeeded, returns 0, otherwise 1.
+ * The key provided is a compact public key.
+ *
+ * @param[in] p Mayo parameter set
+ * @param[in] m Message whose signature is checked (read-only)
+ * @param[in] mlen Length of m (passed by value)
+ * @param[in] sig Signature
+ * @param[in] pk Compacted public key
+ * @return int 0 if verification succeeded, 1 otherwise.
+ */
+#define mayo_verify MAYO_NAMESPACE(mayo_verify)
+int mayo_verify(const mayo_params_t *p, const unsigned char *m,
+                size_t mlen, const unsigned char *sig,
+                const unsigned char *pk);
+
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/mem.h b/src/sig/mayo/pqmayo_mayo-5_opt/mem.h
new file mode 100644
index 0000000000..2aa2196e2e
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/mem.h
@@ -0,0 +1,66 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef MEM_H
+#define MEM_H
+#include <stddef.h>
+#include <stdint.h>
+
+#if defined(__GNUC__) || defined(__clang__) // GCC/Clang: byte swaps map to the compiler builtins
+#define BSWAP32(i) __builtin_bswap32((i))
+#define BSWAP64(i) __builtin_bswap64((i))
+#else // other compilers: shift-and-mask fallback; note the macro argument is expanded several times
+#define BSWAP32(i) ((((i) >> 24) & 0xff) | (((i) >> 8) & 0xff00) | (((i) & 0xff00) << 8) | ((i) << 24))
+#define BSWAP64(i) ((BSWAP32((i) >> 32) & 0xffffffff) | (BSWAP32(i) << 32))
+#endif // __GNUC__ || __clang__
+
+// a > b -> b - a is negative; the difference is formed in 64 bits so it cannot overflow for any pair of ints
+// returns 0xFFFFFFFF if true, 0x00000000 if false
+static inline uint32_t ct_is_greater_than(int a, int b) {
+    int64_t diff = ((int64_t) b) - ((int64_t) a);
+    return (uint32_t) (diff >> (8*sizeof(uint64_t)-1));
+}
+
+// Branch-free "a > b" test producing a 64-bit mask: the sign of (b - a) is smeared over the whole word.
+// Result is 0xFFFFFFFFFFFFFFFF when a > b and 0x0000000000000000 otherwise.
+static inline uint64_t ct_64_is_greater_than(int a, int b) {
+    const int64_t wide_a = (int64_t) a, wide_b = (int64_t) b;
+    return (uint64_t) ((wide_b - wide_a) >> (8*sizeof(uint64_t)-1));
+}
+
+// if a == b -> 0x00000000, else 0xFFFFFFFF (computed in unsigned arithmetic so operands of differing sign compare correctly)
+static inline uint32_t ct_compare_32(int a, int b) {
+    return (uint32_t) 0 - (((uint32_t)(a ^ b) | ((uint32_t) 0 - (uint32_t)(a ^ b))) >> (8*sizeof(uint32_t)-1));
+}
+
+// if a == b -> 0x0000000000000000, else 0xFFFFFFFFFFFFFFFF (computed in unsigned arithmetic so operands of differing sign compare correctly)
+static inline uint64_t ct_compare_64(int a, int b) {
+    return (uint64_t) 0 - (((uint64_t)(a ^ b) | ((uint64_t) 0 - (uint64_t)(a ^ b))) >> (8*sizeof(uint64_t)-1));
+}
+
+// Byte equality mask without branching: 0x00 when a == b, 0xFF when they differ
+static inline unsigned char ct_compare_8(unsigned char a, unsigned char b) {
+    return (unsigned char) ((0 - (int32_t) (a ^ b)) >> (8*sizeof(uint32_t)-1));
+}
+
+#include <oqs/common.h>
+/**
+ * Clears and frees allocated memory by forwarding to OQS_MEM_secure_free().
+ *
+ * @param[out] mem Memory to be cleared and freed.
+ * @param[in] size Size of memory to be cleared and freed.
+ */
+static inline void mayo_secure_free(void *mem, size_t size) {
+    OQS_MEM_secure_free(mem, size);
+}
+
+/**
+ * Clears memory by forwarding to OQS_MEM_cleanse(); the memory is not freed.
+ *
+ * @param[out] mem Memory to be cleared.
+ * @param[in] size Size of memory to be cleared.
+ */
+static inline void mayo_secure_clear(void *mem, size_t size) {
+    OQS_MEM_cleanse(mem, size);
+}
+
+#endif
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/params.c b/src/sig/mayo/pqmayo_mayo-5_opt/params.c
new file mode 100644
index 0000000000..043d4cedc1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/params.c
@@ -0,0 +1,42 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#include <mayo.h>
+
+#ifdef ENABLE_PARAMS_DYNAMIC
+static const unsigned char f_tail_64[] = F_TAIL_64;
+static const unsigned char f_tail_96[] = F_TAIL_96;
+static const unsigned char f_tail_128[] = F_TAIL_128;
+
+#define MAYO_GEN_PARAMS(nm) \
+  const mayo_params_t nm = { \
+    .m = PARAM_JOIN2(nm, m), \
+    .n = PARAM_JOIN2(nm, n), \
+    .o = PARAM_JOIN2(nm, o), \
+    .k = PARAM_JOIN2(nm, k), \
+    .q = PARAM_JOIN2(nm, q), \
+    .f_tail = PARAM_JOIN2(nm, f_tail_arr), \
+    .m_bytes = PARAM_JOIN2(nm, m_bytes), \
+    .O_bytes = PARAM_JOIN2(nm, O_bytes), \
+    .v_bytes = PARAM_JOIN2(nm, v_bytes), \
+    .r_bytes = PARAM_JOIN2(nm, r_bytes), \
+    .P1_bytes = PARAM_JOIN2(nm, P1_bytes), \
+    .P2_bytes = PARAM_JOIN2(nm, P2_bytes), \
+    .P3_bytes = PARAM_JOIN2(nm, P3_bytes), \
+    .csk_bytes = PARAM_JOIN2(nm, csk_bytes), \
+    .esk_bytes = PARAM_JOIN2(nm, esk_bytes), \
+    .cpk_bytes = PARAM_JOIN2(nm, cpk_bytes), \
+    .epk_bytes = PARAM_JOIN2(nm, epk_bytes), \
+    .sig_bytes = PARAM_JOIN2(nm, sig_bytes), \
+    .salt_bytes = PARAM_JOIN2(nm, salt_bytes), \
+    .sk_seed_bytes = PARAM_JOIN2(nm, sk_seed_bytes), \
+    .digest_bytes = PARAM_JOIN2(nm, digest_bytes), \
+    .pk_seed_bytes = PARAM_JOIN2(nm, pk_seed_bytes), \
+    .name = #nm \
+  };
+
+MAYO_GEN_PARAMS(MAYO_1);
+MAYO_GEN_PARAMS(MAYO_2);
+MAYO_GEN_PARAMS(MAYO_3);
+MAYO_GEN_PARAMS(MAYO_5);
+#endif
+
diff --git a/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h b/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h
new file mode 100644
index 0000000000..ce7580f4c1
--- /dev/null
+++ b/src/sig/mayo/pqmayo_mayo-5_opt/simple_arithmetic.h
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef SIMPLE_ARITHMETIC_H
+#define SIMPLE_ARITHMETIC_H
+
+// Multiply two GF(16) elements, reducing by the polynomial x^4 + x + 1.
+static inline unsigned char mul_f(unsigned char a, unsigned char b) {
+    // Carry-less product: XOR together one shifted copy of b for each of
+    // the four low bits of a.
+    const unsigned char prod = (unsigned char) (((a & 1) * b) ^
+                                                ((a & 2) * b) ^
+                                                ((a & 4) * b) ^
+                                                ((a & 8) * b));
+
+    // Fold the high nibble back down: a bit at x^(4+i) becomes
+    // x^(i+1) + x^i, hence the two shifted copies.
+    const unsigned char overflow = prod & 0xf0;
+    return (prod ^ (overflow >> 4) ^ (overflow >> 3)) & 0x0f;
+}
+
+static inline uint64_t mul_fx8(unsigned char a, uint64_t b) {
+    // Carry-less product of the low nibble of a with the whole word b.
+    uint64_t prod = 0;
+    for (unsigned bit = 1; bit <= 8; bit <<= 1) {
+        prod ^= (uint64_t) (a & bit) * b;
+    }
+
+    // Reduce mod x^4 + x + 1 in every byte at once: each high nibble is
+    // folded into the low nibble of its own byte.
+    const uint64_t spill = prod & 0xf0f0f0f0f0f0f0f0;
+    const uint64_t folded = prod ^ (spill >> 4) ^ (spill >> 3);
+    return folded & 0x0f0f0f0f0f0f0f0f;
+}
+
+// Sum of two GF(16) elements, i.e. their bitwise XOR
+static inline unsigned char add_f(unsigned char a, unsigned char b) {
+    return (unsigned char) (a ^ b);
+}
+
+// Difference of two GF(16) elements; computed as a bitwise XOR, exactly like the sum
+static inline unsigned char sub_f(unsigned char a, unsigned char b) {
+    return (unsigned char) (a ^ b);
+}
+
+// GF(16) negation: the input is returned unchanged (addition is XOR, so every element is its own negative)
+static inline unsigned char neg_f(unsigned char a) {
+    return a;
+}
+
+static inline unsigned char inverse_f(unsigned char a) {
+    // Inversion by exponentiation: the result is a^14, assembled from the
+    // repeated squarings a^2, a^4, a^8 and the product a^6 = a^2 * a^4.
+    // Table-based alternative kept for reference:
+    // static unsigned char table[16] = {0, 1, 9, 14, 13, 11, 7, 6, 15, 2, 12, 5,
+    // 10, 4, 3, 8}; return table[a & 15];
+
+    const unsigned char sq = mul_f(a, a);
+    const unsigned char quad = mul_f(sq, sq);
+    const unsigned char oct = mul_f(quad, quad);
+    return mul_f(oct, mul_f(sq, quad));
+}
+
+static inline unsigned char lincomb(const unsigned char *a,
+                                    const unsigned char *b, int n, int m) {
+    unsigned char sum = 0; // running GF(16) sum of a[idx] * b[idx * m]
+    for (int idx = 0; idx < n; ++idx) {
+        sum = add_f(mul_f(a[idx], b[idx * m]), sum);
+    }
+    return sum;
+}
+
+static inline void mat_mul(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int colrow_ab, int row_a, int col_b) {
+    for (int row = 0; row < row_a; ++row) { // c is row_a x col_b, row-major
+        for (int col = 0; col < col_b; ++col) {
+            c[row * col_b + col] = lincomb(a + row * colrow_ab, b + col, colrow_ab, col_b);
+        }
+    }
+}
+
+static inline void mat_add(const unsigned char *a, const unsigned char *b,
+                    unsigned char *c, int m, int n) {
+    for (int row = 0; row < m; ++row) { // element-wise sum of two m x n row-major matrices
+        for (int col = 0; col < n; ++col) {
+            c[row * n + col] = add_f(a[row * n + col], b[row * n + col]);
+        }
+    }
+}
+
+// Copy the 2 * m_legs 64-bit words of `in` into `out`.
+static inline void m_vec_copy(int m_legs, const uint64_t *in, uint64_t *out) {
+    const int word_count = m_legs * 2;
+    for (int w = 0; w < word_count; ++w) {
+        out[w] = in[w];
+    }
+}
+
+// XOR the 2 * m_legs 64-bit words of `in` into the accumulator `acc`.
+static inline void m_vec_add(int m_legs, const uint64_t *in, uint64_t *acc) {
+    const int word_count = m_legs * 2;
+    for (int w = 0; w < word_count; ++w) {
+        acc[w] = acc[w] ^ in[w];
+    }
+}
+
+// This implements arithmetic for nibble-packed vectors of m field elements in Z_2[x]/(x^4+x+1)
+// gf16 := gf2[x]/(x^4+x+1)
+
+static inline uint32_t gf16v_mul_u32(uint32_t a, uint8_t b) {
+    // Multiplies each of the eight 4-bit lanes of `a` by the scalar `b`
+    // using shift-and-add: `cur` holds a * x^k, and it is added into the
+    // result whenever bit k of b is set.
+    const uint32_t lane_msb = 0x88888888; // top bit of every nibble
+    const uint32_t scalar = b;
+    uint32_t cur = a;
+
+    // k = 0 term
+    uint32_t acc = cur * (scalar & 1);
+
+    for (int k = 1; k <= 3; ++k) {
+        // Multiply every lane by x: bits sitting in a nibble's top position
+        // are removed first, then re-enter that same nibble as x + 1
+        // (value 3), which is the reduction x^4 = x + 1.
+        const uint32_t carry = cur & lane_msb;
+        cur = ((cur ^ carry) << 1) ^ ((carry >> 3) * 3);
+
+        // k-th term, selected by bit k of the scalar
+        acc ^= cur * ((scalar >> k) & 1);
+    }
+
+    return acc;
+}
+ 
+static inline void m_vec_mul_add(int m_legs, const uint64_t *in, unsigned char a, uint64_t *acc) { // acc[i] ^= in[i] scaled by a, for i in [0, 2*m_legs)
+    for(int i=0; i < m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], a);        // NOTE(review): gf16v_mul_u64 is not defined in this header (only gf16v_mul_u32 is) -- presumably supplied by the including file; confirm
+    }
+}
+
+static inline void m_vec_mul_add_x(int m_legs, const uint64_t *in, uint64_t *acc) { // same as m_vec_mul_add with the scalar fixed to 0x2
+    for(int i=0;i<m_legs*2;i++){
+        acc[i] ^= gf16v_mul_u64(in[i], 0x2); // NOTE(review): gf16v_mul_u64 is not defined in this header -- presumably supplied by the including file; confirm
+    }
+}
+
+#endif
+
diff --git a/src/sig/mayo/sig_mayo.h b/src/sig/mayo/sig_mayo.h
new file mode 100644
index 0000000000..08717fb4d7
--- /dev/null
+++ b/src/sig/mayo/sig_mayo.h
@@ -0,0 +1,52 @@
+// SPDX-License-Identifier: MIT
+
+#ifndef OQS_SIG_MAYO_H
+#define OQS_SIG_MAYO_H
+
+#include <oqs/oqs.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_1)
+#define OQS_SIG_mayo_1_length_public_key 1168
+#define OQS_SIG_mayo_1_length_secret_key 24
+#define OQS_SIG_mayo_1_length_signature 321
+
+OQS_SIG *OQS_SIG_mayo_1_new(void);
+OQS_API OQS_STATUS OQS_SIG_mayo_1_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_1_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_1_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#if defined(OQS_ENABLE_SIG_mayo_2)
+#define OQS_SIG_mayo_2_length_public_key 5488
+#define OQS_SIG_mayo_2_length_secret_key 24
+#define OQS_SIG_mayo_2_length_signature 180
+
+OQS_SIG *OQS_SIG_mayo_2_new(void);
+OQS_API OQS_STATUS OQS_SIG_mayo_2_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_2_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_2_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#if defined(OQS_ENABLE_SIG_mayo_3)
+#define OQS_SIG_mayo_3_length_public_key 2656
+#define OQS_SIG_mayo_3_length_secret_key 32
+#define OQS_SIG_mayo_3_length_signature 577
+
+OQS_SIG *OQS_SIG_mayo_3_new(void);
+OQS_API OQS_STATUS OQS_SIG_mayo_3_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_3_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_3_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#if defined(OQS_ENABLE_SIG_mayo_5)
+#define OQS_SIG_mayo_5_length_public_key 5008
+#define OQS_SIG_mayo_5_length_secret_key 40
+#define OQS_SIG_mayo_5_length_signature 838
+
+OQS_SIG *OQS_SIG_mayo_5_new(void);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key);
+OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key);
+#endif
+
+#endif
diff --git a/src/sig/mayo/sig_mayo_1.c b/src/sig/mayo/sig_mayo_1.c
new file mode 100644
index 0000000000..93034eb5ef
--- /dev/null
+++ b/src/sig/mayo/sig_mayo_1.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_mayo.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_1)
+
+OQS_SIG *OQS_SIG_mayo_1_new(void) {
+	OQS_SIG *desc = malloc(sizeof(*desc)); /* caller receives NULL on allocation failure */
+	if (!desc) {
+		return NULL;
+	}
+	/* Entry points implementing the scheme. */
+	desc->keypair = OQS_SIG_mayo_1_keypair;
+	desc->sign = OQS_SIG_mayo_1_sign;
+	desc->verify = OQS_SIG_mayo_1_verify;
+
+	/* Fixed key and signature sizes. */
+	desc->length_public_key = OQS_SIG_mayo_1_length_public_key;
+	desc->length_secret_key = OQS_SIG_mayo_1_length_secret_key;
+	desc->length_signature = OQS_SIG_mayo_1_length_signature;
+
+	/* Identification and security claims. */
+	desc->method_name = OQS_SIG_alg_mayo_1;
+	desc->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo";
+	desc->claimed_nist_level = 1;
+	desc->euf_cma = true;
+	return desc;
+}
+
+extern int pqmayo_MAYO_1_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_1_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_1_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_mayo_1_avx2)
+extern int pqmayo_MAYO_1_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_1_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_1_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_mayo_1_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_1_avx2) /* AVX2 implementation is compiled in */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_keypair(public_key, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_keypair(public_key, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_1_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_1_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_1_avx2) /* AVX2 implementation is compiled in */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_1_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_1_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_mayo_1_avx2) /* AVX2 implementation is compiled in; note the pqmayo entry points take the signature before the message */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_1_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_1_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif /* OQS_ENABLE_SIG_mayo_1_avx2 */
+}
+
+#endif
diff --git a/src/sig/mayo/sig_mayo_2.c b/src/sig/mayo/sig_mayo_2.c
new file mode 100644
index 0000000000..25719f0aee
--- /dev/null
+++ b/src/sig/mayo/sig_mayo_2.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_mayo.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_2)
+
+OQS_SIG *OQS_SIG_mayo_2_new(void) {
+	OQS_SIG *desc = malloc(sizeof(*desc)); /* caller receives NULL on allocation failure */
+	if (!desc) {
+		return NULL;
+	}
+	/* Entry points implementing the scheme. */
+	desc->keypair = OQS_SIG_mayo_2_keypair;
+	desc->sign = OQS_SIG_mayo_2_sign;
+	desc->verify = OQS_SIG_mayo_2_verify;
+
+	/* Fixed key and signature sizes. */
+	desc->length_public_key = OQS_SIG_mayo_2_length_public_key;
+	desc->length_secret_key = OQS_SIG_mayo_2_length_secret_key;
+	desc->length_signature = OQS_SIG_mayo_2_length_signature;
+
+	/* Identification and security claims. */
+	desc->method_name = OQS_SIG_alg_mayo_2;
+	desc->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo";
+	desc->claimed_nist_level = 1;
+	desc->euf_cma = true;
+	return desc;
+}
+
+extern int pqmayo_MAYO_2_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_2_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_2_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+
+#if defined(OQS_ENABLE_SIG_mayo_2_avx2)
+extern int pqmayo_MAYO_2_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_2_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_2_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_mayo_2_keypair(uint8_t *public_key, uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_2_avx2) /* AVX2 implementation is compiled in */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_keypair(public_key, secret_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_keypair(public_key, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_keypair(public_key, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_2_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_2_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+#if defined(OQS_ENABLE_SIG_mayo_2_avx2) /* AVX2 implementation is compiled in */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_2_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_2_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+#if defined(OQS_ENABLE_SIG_mayo_2_avx2) /* AVX2 implementation is compiled in; note the pqmayo entry points take the signature before the message */
+#if defined(OQS_DIST_BUILD) /* distributable build: choose the implementation from a run-time CPU check */
+	if (OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+#endif /* OQS_DIST_BUILD */
+		return (OQS_STATUS) pqmayo_MAYO_2_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#if defined(OQS_DIST_BUILD)
+	} else {
+		return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+	}
+#endif /* OQS_DIST_BUILD */
+#else /* no AVX2 implementation compiled in: always use the opt implementation */
+	return (OQS_STATUS) pqmayo_MAYO_2_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif /* OQS_ENABLE_SIG_mayo_2_avx2 */
+}
+
+#endif
diff --git a/src/sig/mayo/sig_mayo_3.c b/src/sig/mayo/sig_mayo_3.c
new file mode 100644
index 0000000000..7a68024880
--- /dev/null
+++ b/src/sig/mayo/sig_mayo_3.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_mayo.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_3)
+
+OQS_SIG *OQS_SIG_mayo_3_new(void) {
+	/* Build a heap-allocated OQS_SIG descriptor for MAYO-3; yields NULL when malloc fails. */
+	OQS_SIG *obj = malloc(sizeof(*obj));
+	if (!obj) {
+		return NULL;
+	}
+	/* Operations: the wrappers defined further down in this file. */
+	obj->keypair = OQS_SIG_mayo_3_keypair;
+	obj->sign = OQS_SIG_mayo_3_sign;
+	obj->verify = OQS_SIG_mayo_3_verify;
+	/* Sizes, taken from the OQS_SIG_mayo_3_length_* constants. */
+	obj->length_public_key = OQS_SIG_mayo_3_length_public_key;
+	obj->length_secret_key = OQS_SIG_mayo_3_length_secret_key;
+	obj->length_signature = OQS_SIG_mayo_3_length_signature;
+	/* Claimed security properties. */
+	obj->claimed_nist_level = 3;
+	obj->euf_cma = true;
+	/* Identification: algorithm name plus the upstream branch URL used as the version string. */
+	obj->method_name = OQS_SIG_alg_mayo_3;
+	obj->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo";
+	return obj;
+}
+
+extern int pqmayo_MAYO_3_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_3_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_3_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+// The opt prototypes above are always declared; the AVX2 prototypes below only when that backend is enabled.
+#if defined(OQS_ENABLE_SIG_mayo_3_avx2)
+extern int pqmayo_MAYO_3_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_3_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_3_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_mayo_3_keypair(uint8_t *public_key, uint8_t *secret_key) {
+	/* Generate a MAYO-3 key pair by forwarding to the AVX2 or the opt backend; the backend status is returned as OQS_STATUS. */
+#ifdef OQS_ENABLE_SIG_mayo_3_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_keypair(public_key, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_keypair(public_key, secret_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_keypair(public_key, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_3_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_3_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+	/* Sign `message` with MAYO-3 by forwarding to the AVX2 or the opt backend; the backend status is returned as OQS_STATUS. */
+#ifdef OQS_ENABLE_SIG_mayo_3_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_3_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_3_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+	/* Verify a MAYO-3 signature; the backend prototypes take the signature ahead of the message, so the arguments are reordered here. */
+#ifdef OQS_ENABLE_SIG_mayo_3_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_3_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_3_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif /* OQS_ENABLE_SIG_mayo_3_avx2 */
+}
+
+#endif
diff --git a/src/sig/mayo/sig_mayo_5.c b/src/sig/mayo/sig_mayo_5.c
new file mode 100644
index 0000000000..ca19cfbf91
--- /dev/null
+++ b/src/sig/mayo/sig_mayo_5.c
@@ -0,0 +1,90 @@
+// SPDX-License-Identifier: MIT
+
+#include <stdlib.h>
+
+#include <oqs/sig_mayo.h>
+
+#if defined(OQS_ENABLE_SIG_mayo_5)
+
+OQS_SIG *OQS_SIG_mayo_5_new(void) {
+	/* Build a heap-allocated OQS_SIG descriptor for MAYO-5; yields NULL when malloc fails. */
+	OQS_SIG *obj = malloc(sizeof(*obj));
+	if (!obj) {
+		return NULL;
+	}
+	/* Operations: the wrappers defined further down in this file. */
+	obj->keypair = OQS_SIG_mayo_5_keypair;
+	obj->sign = OQS_SIG_mayo_5_sign;
+	obj->verify = OQS_SIG_mayo_5_verify;
+	/* Sizes, taken from the OQS_SIG_mayo_5_length_* constants. */
+	obj->length_public_key = OQS_SIG_mayo_5_length_public_key;
+	obj->length_secret_key = OQS_SIG_mayo_5_length_secret_key;
+	obj->length_signature = OQS_SIG_mayo_5_length_signature;
+	/* Claimed security properties. */
+	obj->claimed_nist_level = 5;
+	obj->euf_cma = true;
+	/* Identification: algorithm name plus the upstream branch URL used as the version string. */
+	obj->method_name = OQS_SIG_alg_mayo_5;
+	obj->alg_version = "https://github.com/PQCMayo/MAYO-C/tree/nibbling-mayo";
+	return obj;
+}
+
+extern int pqmayo_MAYO_5_opt_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_5_opt_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_5_opt_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+// The opt prototypes above are always declared; the AVX2 prototypes below only when that backend is enabled.
+#if defined(OQS_ENABLE_SIG_mayo_5_avx2)
+extern int pqmayo_MAYO_5_avx2_crypto_sign_keypair(uint8_t *pk, uint8_t *sk);
+extern int pqmayo_MAYO_5_avx2_crypto_sign_signature(uint8_t *sig, size_t *siglen, const uint8_t *m, size_t mlen, const uint8_t *sk);
+extern int pqmayo_MAYO_5_avx2_crypto_sign_verify(const uint8_t *sig, size_t siglen, const uint8_t *m, size_t mlen, const uint8_t *pk);
+#endif
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_keypair(uint8_t *public_key, uint8_t *secret_key) {
+	/* Generate a MAYO-5 key pair by forwarding to the AVX2 or the opt backend; the backend status is returned as OQS_STATUS. */
+#ifdef OQS_ENABLE_SIG_mayo_5_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_keypair(public_key, secret_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_keypair(public_key, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_sign(uint8_t *signature, size_t *signature_len, const uint8_t *message, size_t message_len, const uint8_t *secret_key) {
+	/* Sign `message` with MAYO-5 by forwarding to the AVX2 or the opt backend; the backend status is returned as OQS_STATUS. */
+#ifdef OQS_ENABLE_SIG_mayo_5_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_signature(signature, signature_len, message, message_len, secret_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+OQS_API OQS_STATUS OQS_SIG_mayo_5_verify(const uint8_t *message, size_t message_len, const uint8_t *signature, size_t signature_len, const uint8_t *public_key) {
+	/* Verify a MAYO-5 signature; the backend prototypes take the signature ahead of the message, so the arguments are reordered here. */
+#ifdef OQS_ENABLE_SIG_mayo_5_avx2
+#ifdef OQS_DIST_BUILD
+	/* Distributable build: both backends are referenced, so choose per call from the CPU feature check. */
+	if (!OQS_CPU_has_extension(OQS_CPU_EXT_AVX2)) {
+		return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+	}
+#endif /* OQS_DIST_BUILD */
+	/* Reached unconditionally in non-distributable builds, otherwise only after the AVX2 check passed. */
+	return (OQS_STATUS) pqmayo_MAYO_5_avx2_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#else /* AVX2 backend not enabled */
+	return (OQS_STATUS) pqmayo_MAYO_5_opt_crypto_sign_verify(signature, signature_len, message, message_len, public_key);
+#endif /* OQS_ENABLE_SIG_mayo_5_avx2 */
+}
+
+#endif
diff --git a/src/sig/sig.c b/src/sig/sig.c
index ae41478387..bab752c607 100644
--- a/src/sig/sig.c
+++ b/src/sig/sig.c
@@ -39,7 +39,11 @@ OQS_API const char *OQS_SIG_alg_identifier(size_t i) {
 		OQS_SIG_alg_sphincs_shake_192f_simple,
 		OQS_SIG_alg_sphincs_shake_192s_simple,
 		OQS_SIG_alg_sphincs_shake_256f_simple,
-		OQS_SIG_alg_sphincs_shake_256s_simple,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
+		OQS_SIG_alg_sphincs_shake_256s_simple,
+		OQS_SIG_alg_mayo_1,
+		OQS_SIG_alg_mayo_2,
+		OQS_SIG_alg_mayo_3,
+		OQS_SIG_alg_mayo_5,///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
 	};
 	if (i >= OQS_SIG_algs_length) {
 		return NULL;
@@ -232,6 +236,34 @@ OQS_API int OQS_SIG_alg_is_enabled(const char *method_name) {
 #else
 		return 0;
 #endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_1)) {
+#ifdef OQS_ENABLE_SIG_mayo_1
+		return 1;
+#else
+		return 0;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_2)) {
+#ifdef OQS_ENABLE_SIG_mayo_2
+		return 1;
+#else
+		return 0;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_3)) {
+#ifdef OQS_ENABLE_SIG_mayo_3
+		return 1;
+#else
+		return 0;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) {
+#ifdef OQS_ENABLE_SIG_mayo_5
+		return 1;
+#else
+		return 0;
+#endif
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ENABLED_CASE_END
 	} else {
 		return 0;
@@ -418,6 +450,34 @@ OQS_API OQS_SIG *OQS_SIG_new(const char *method_name) {
 #else
 		return NULL;
 #endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_1)) {
+#ifdef OQS_ENABLE_SIG_mayo_1
+		return OQS_SIG_mayo_1_new();
+#else
+		return NULL;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_2)) {
+#ifdef OQS_ENABLE_SIG_mayo_2
+		return OQS_SIG_mayo_2_new();
+#else
+		return NULL;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_3)) {
+#ifdef OQS_ENABLE_SIG_mayo_3
+		return OQS_SIG_mayo_3_new();
+#else
+		return NULL;
+#endif
+
+	} else if (0 == strcasecmp(method_name, OQS_SIG_alg_mayo_5)) {
+#ifdef OQS_ENABLE_SIG_mayo_5
+		return OQS_SIG_mayo_5_new();
+#else
+		return NULL;
+#endif
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_NEW_CASE_END
 		// EDIT-WHEN-ADDING-SIG
 	} else {
diff --git a/src/sig/sig.h b/src/sig/sig.h
index 6e3c3951c5..bb2a738215 100644
--- a/src/sig/sig.h
+++ b/src/sig/sig.h
@@ -82,12 +82,20 @@ extern "C" {
 #define OQS_SIG_alg_sphincs_shake_256f_simple "SPHINCS+-SHAKE-256f-simple"
 /** Algorithm identifier for SPHINCS+-SHAKE-256s-simple */
 #define OQS_SIG_alg_sphincs_shake_256s_simple "SPHINCS+-SHAKE-256s-simple"
+/** Algorithm identifier for MAYO-1 */
+#define OQS_SIG_alg_mayo_1 "MAYO-1"
+/** Algorithm identifier for MAYO-2 */
+#define OQS_SIG_alg_mayo_2 "MAYO-2"
+/** Algorithm identifier for MAYO-3 */
+#define OQS_SIG_alg_mayo_3 "MAYO-3"
+/** Algorithm identifier for MAYO-5 */
+#define OQS_SIG_alg_mayo_5 "MAYO-5"
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALG_IDENTIFIER_END
 // EDIT-WHEN-ADDING-SIG
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_START
 
 /** Number of algorithm identifiers above. */
-#define OQS_SIG_algs_length 25
+#define OQS_SIG_algs_length 29
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_ALGS_LENGTH_END
 
 /**
@@ -266,6 +274,9 @@ OQS_API void OQS_SIG_free(OQS_SIG *sig);
 #ifdef OQS_ENABLE_SIG_SPHINCS
 #include <oqs/sig_sphincs.h>
 #endif /* OQS_ENABLE_SIG_SPHINCS */
+#ifdef OQS_ENABLE_SIG_MAYO
+#include <oqs/sig_mayo.h>
+#endif /* OQS_ENABLE_SIG_MAYO */
 ///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_INCLUDE_END
 // EDIT-WHEN-ADDING-SIG
 
diff --git a/tests/KATs/sig/kats.json b/tests/KATs/sig/kats.json
index e60fe897ba..918b893659 100644
--- a/tests/KATs/sig/kats.json
+++ b/tests/KATs/sig/kats.json
@@ -27,6 +27,22 @@
     "all": "362ecc0537ca1fe25143fb7ccb04de8ee7703469d13ebcf311ab124a5c374a65",
     "single": "91842d41138e7cfaf6e2e8f12a03c3b3411302255121e4d07d02f91a003c0395"
   },
+  "MAYO-1": {
+    "all": "5cf156cf74fc65b43863399ecc4b26ad7b4b3b2cd8485215dc0c011e2825b145",
+    "single": "ba2473dedd92cf3b8a1fc14fc22f2ffdde972c8b64cfcd8cddb4f803e48df017"
+  },
+  "MAYO-2": {
+    "all": "c0daf74b54fae78685c87b32d3b36a418bac884c3564ea96d56c6601b138d449",
+    "single": "72cb237642b2c0c4e7f8c824d9c8601ac7189784649d28dbb2cccfb94732c9a3"
+  },
+  "MAYO-3": {
+    "all": "f66b95dda153b7df00610aa018f0644146e7e564b33562c51bb088c40fb0dcb2",
+    "single": "dbc49f4fdfa0de69d416051215cb53c042c4a329d325452d079f3734b7467a6b"
+  },
+  "MAYO-5": {
+    "all": "7b230c2626f57159a243d8dfc69c62cb94dd0f179dd2b4f2ef3606deb6404477",
+    "single": "f2c1c69045c7d15e714a04119965e8a7007ef54f9293158587560227c97b237d"
+  },
   "ML-DSA-44": {
     "all": "183bc0c4398ade4fc17b6a7d876b82545a96331139a4f27269c95664b8c483f9",
     "single": "e6f3ec4dc0b02dd3bcbbc6b105190e1890ca0bb3f802e2b571f0d70f3993a2e1"
diff --git a/tests/constant_time/sig/issues.json b/tests/constant_time/sig/issues.json
index 7eb295ffa0..b5ea3f5a1d 100644
--- a/tests/constant_time/sig/issues.json
+++ b/tests/constant_time/sig/issues.json
@@ -7,6 +7,9 @@
   "Falcon-512": ["falcon"],
   "Falcon-padded-1024": ["falcon"],
   "Falcon-padded-512": ["falcon"],
+  "MAYO-1": [],
+  "MAYO-2": [],
+  "MAYO-3": [],
   "ML-DSA-44-ipd": [],
   "ML-DSA-65-ipd": [],
   "ML-DSA-87-ipd": [],
diff --git a/tests/constant_time/sig/passes.json b/tests/constant_time/sig/passes.json
index a6096eb640..4803e636b9 100644
--- a/tests/constant_time/sig/passes.json
+++ b/tests/constant_time/sig/passes.json
@@ -7,6 +7,9 @@
   "Falcon-512": ["falcon_keygen", "falcon_sign"],
   "Falcon-padded-1024": ["falcon_keygen", "falcon_sign"],
   "Falcon-padded-512": ["falcon_keygen", "falcon_sign"],
+  "MAYO-1": ["mayo"],
+  "MAYO-2": ["mayo"],
+  "MAYO-3": ["mayo"],
   "ML-DSA-44-ipd": ["ml_dsa", "ml_dsa-avx2"],
   "ML-DSA-65-ipd": ["ml_dsa", "ml_dsa-avx2"],
   "ML-DSA-87-ipd": ["ml_dsa", "ml_dsa-avx2"],
diff --git a/tests/constant_time/sig/passes/mayo b/tests/constant_time/sig/passes/mayo
new file mode 100644
index 0000000000..9a97a98ae3
--- /dev/null
+++ b/tests/constant_time/sig/passes/mayo
@@ -0,0 +1,5 @@
+{
+	Restart in case no solution x to Ax = y, with r used as randomness was found
+	Memcheck:Cond
+	src:arithmetic.c:282 # fun:pqmayo_MAYO_*sample_solution
+}
\ No newline at end of file
diff --git a/tests/kat_sig.c b/tests/kat_sig.c
index 21c208f3a5..ffb0456920 100644
--- a/tests/kat_sig.c
+++ b/tests/kat_sig.c
@@ -272,6 +272,46 @@ OQS_STATUS combine_message_signature(uint8_t **signed_msg, size_t *signed_msg_le
 		memcpy(*signed_msg, signature, signature_len);
 		memcpy(*signed_msg + signature_len, msg, msg_len);
 		return OQS_SUCCESS;
+	} else if (0 == strcmp(sig->method_name, "MAYO-1")) {
+		// signed_msg = signature || msg
+		*signed_msg_len = signature_len + msg_len;
+		*signed_msg = malloc(*signed_msg_len);
+		if (*signed_msg == NULL) {
+			return OQS_ERROR;
+		}
+		memcpy(*signed_msg, signature, signature_len);
+		memcpy(*signed_msg + signature_len, msg, msg_len);
+		return OQS_SUCCESS;
+	} else if (0 == strcmp(sig->method_name, "MAYO-2")) {
+		// signed_msg = signature || msg
+		*signed_msg_len = signature_len + msg_len;
+		*signed_msg = malloc(*signed_msg_len);
+		if (*signed_msg == NULL) {
+			return OQS_ERROR;
+		}
+		memcpy(*signed_msg, signature, signature_len);
+		memcpy(*signed_msg + signature_len, msg, msg_len);
+		return OQS_SUCCESS;
+	} else if (0 == strcmp(sig->method_name, "MAYO-3")) {
+		// signed_msg = signature || msg
+		*signed_msg_len = signature_len + msg_len;
+		*signed_msg = malloc(*signed_msg_len);
+		if (*signed_msg == NULL) {
+			return OQS_ERROR;
+		}
+		memcpy(*signed_msg, signature, signature_len);
+		memcpy(*signed_msg + signature_len, msg, msg_len);
+		return OQS_SUCCESS;
+	} else if (0 == strcmp(sig->method_name, "MAYO-5")) {
+		// signed_msg = signature || msg
+		*signed_msg_len = signature_len + msg_len;
+		*signed_msg = malloc(*signed_msg_len);
+		if (*signed_msg == NULL) {
+			return OQS_ERROR;
+		}
+		memcpy(*signed_msg, signature, signature_len);
+		memcpy(*signed_msg + signature_len, msg, msg_len);
+		return OQS_SUCCESS;
 		///// OQS_COPY_FROM_UPSTREAM_FRAGMENT_COMBINE_MESSAGE_SIGNATURE_END
 	} else {
 		return OQS_ERROR;
diff --git a/tests/test_aes.c b/tests/test_aes.c
index 4ba265c690..bf1f5aed39 100644
--- a/tests/test_aes.c
+++ b/tests/test_aes.c
@@ -52,6 +52,30 @@ static int test_aes128_correctness(void) {
 	return EXIT_SUCCESS;
 }
 
+// test vector #3 from https://tools.ietf.org/html/rfc3686#section-6
+static const uint8_t test_aes128ctr_key[] = {0x76, 0x91, 0xBE, 0x03, 0x5E, 0x50, 0x20, 0xA8, 0xAC, 0x6E, 0x61, 0x85, 0x29, 0xF9, 0xA0, 0xDC};
+static const uint8_t test_aes128ctr_iv[] = {0x00, 0xE0, 0x01, 0x7B, 0x27, 0x77, 0x7F, 0x3F, 0x4A, 0x17, 0x86, 0xF0, 0x00, 0x00, 0x00, 0x01};
+static const uint8_t test_aes128ctr_plaintext[] = {0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F, 0x20, 0x21, 0x22, 0x23};
+static const uint8_t test_aes128ctr_ciphertext[] = {0xC1, 0xCF, 0x48, 0xA8, 0x9F, 0x2F, 0xFD, 0xD9, 0xCF, 0x46, 0x52, 0xE9, 0xEF, 0xDB, 0x72, 0xD7, 0x45, 0x40, 0xA4, 0x2B, 0xDE, 0x6D, 0x78, 0x36, 0xD5, 0x9A, 0x5C, 0xEA, 0xAE, 0xF3, 0x10, 0x53, 0x25, 0xB2, 0x07, 0x2F};
+
+static int test_aes128ctr_correctness(void) { // AES-128-CTR check against the RFC 3686 vector above; returns EXIT_SUCCESS or EXIT_FAILURE
+	uint8_t derived_ciphertext[sizeof(test_aes128ctr_ciphertext)]; // first the raw keystream, then keystream XOR plaintext
+	void *schedule = NULL;
+	OQS_AES128_CTR_inc_init(test_aes128ctr_key, &schedule);
+	OQS_AES128_CTR_inc_stream_iv(test_aes128ctr_iv, sizeof(test_aes128ctr_iv), schedule, derived_ciphertext, sizeof(derived_ciphertext));
+	OQS_AES128_free_schedule(schedule); // last use was the call above; freeing here keeps the failure return below from leaking it
+	for (size_t i = 0; i < sizeof(derived_ciphertext); i++) {
+		derived_ciphertext[i] ^= test_aes128ctr_plaintext[i];
+	}
+	if (memcmp(test_aes128ctr_ciphertext, derived_ciphertext, sizeof(derived_ciphertext)) != 0) {
+		printf("test_aes128ctr_correctness ciphertext does not match\n");
+		OQS_print_hex_string("expected ciphertext", test_aes128ctr_ciphertext, sizeof(test_aes128ctr_ciphertext));
+		OQS_print_hex_string("derived  ciphertext", derived_ciphertext, sizeof(derived_ciphertext));
+		return EXIT_FAILURE;
+	}
+	return EXIT_SUCCESS;
+}
+
 static int test_aes256_correctness(void) {
 	uint8_t derived_ciphertext[16];
 	void *schedule = NULL;
@@ -159,6 +183,10 @@ int main(int argc, char **argv) {
 		OQS_destroy();
 		return EXIT_FAILURE;
 	}
+	if (test_aes128ctr_correctness() != EXIT_SUCCESS) {
+		OQS_destroy();
+		return EXIT_FAILURE;
+	}
 
 	if (test_aes256_correctness() != EXIT_SUCCESS) {
 		OQS_destroy();
diff --git a/tests/test_binary.py b/tests/test_binary.py
index d212f416d3..1c33093ae7 100644
--- a/tests/test_binary.py
+++ b/tests/test_binary.py
@@ -33,7 +33,7 @@ def test_namespace():
             symbols.append(line)
 
     # ideally this would be just ['oqs', 'pqclean'], but contains exceptions (e.g., providing compat implementations of unavailable platform functions)
-    namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk']
+    namespaces = ['oqs', 'pqclean', 'keccak', 'pqcrystals', 'pqmayo', 'init', 'fini', 'seedexpander', '__x86.get_pc_thunk']
     non_namespaced = []
 
     for symbolstr in symbols:
diff --git a/tests/test_sig.c b/tests/test_sig.c
index 90990adad2..185ef169c9 100644
--- a/tests/test_sig.c
+++ b/tests/test_sig.c
@@ -224,17 +224,30 @@ int main(int argc, char **argv) {
 	OQS_STATUS rc;
 #if OQS_USE_PTHREADS
 #define MAX_LEN_SIG_NAME_ 64
-	pthread_t thread;
-	struct thread_data td;
-	td.alg_name = alg_name;
-	int trc = pthread_create(&thread, NULL, test_wrapper, &td);
-	if (trc) {
-		fprintf(stderr, "ERROR: Creating pthread\n");
-		OQS_destroy();
-		return EXIT_FAILURE;
+	// don't run MAYO_5 in threads because of large stack usage
+	char no_thread_sig_patterns[][MAX_LEN_SIG_NAME_]  = {"MAYO-5"};
+	int test_in_thread = 1;
+	for (size_t i = 0 ; i < sizeof(no_thread_sig_patterns) / MAX_LEN_SIG_NAME_; ++i) {
+		if (strstr(alg_name, no_thread_sig_patterns[i]) != NULL) {
+			test_in_thread = 0;
+			break;
+		}
+	}
+	if (test_in_thread) {
+		pthread_t thread;
+		struct thread_data td;
+		td.alg_name = alg_name;
+		int trc = pthread_create(&thread, NULL, test_wrapper, &td);
+		if (trc) {
+			fprintf(stderr, "ERROR: Creating pthread\n");
+			OQS_destroy();
+			return EXIT_FAILURE;
+		}
+		pthread_join(thread, NULL);
+		rc = td.rc;
+	} else {
+		rc = sig_test_correctness(alg_name);
 	}
-	pthread_join(thread, NULL);
-	rc = td.rc;
 #else
 	rc = sig_test_correctness(alg_name);
 #endif
diff --git a/zephyr/CMakeLists.txt b/zephyr/CMakeLists.txt
index ff772d895e..29be2f9d55 100644
--- a/zephyr/CMakeLists.txt
+++ b/zephyr/CMakeLists.txt
@@ -121,6 +121,13 @@ if(CONFIG_LIBOQS)
                 set(OQS_ENABLE_SIG_SPHINCS OFF)
         endif()
 
+        if(CONFIG_LIBOQS_ENABLE_SIG_MAYO) # Kconfig switch for the MAYO signature family
+                set(OQS_ENABLE_SIG_MAYO ON)
+                set(OQS_ENABLE_SIG_mayo_5 OFF) # MAYO-5 is kept out of Zephyr builds; NOTE(review): presumably because of its large stack usage - confirm
+        else()  # family disabled via Kconfig
+                set(OQS_ENABLE_SIG_MAYO OFF)
+        endif()
+
         # Add the actual liboqs targets
         add_subdirectory(.. build)
 
diff --git a/zephyr/Kconfig b/zephyr/Kconfig
index e7ceb8eac8..9f34817012 100644
--- a/zephyr/Kconfig
+++ b/zephyr/Kconfig
@@ -66,4 +66,9 @@ config LIBOQS_ENABLE_SIG_SPHINCS
 	default y
 	depends on LIBOQS
 
+config LIBOQS_ENABLE_SIG_MAYO
+	bool "Enable the MAYO signature algorithm"
+	default y
+	depends on LIBOQS
+
 endmenu
diff --git a/zephyr/samples/Signatures/prj.conf b/zephyr/samples/Signatures/prj.conf
index 821b1889d3..1e0ef6749c 100644
--- a/zephyr/samples/Signatures/prj.conf
+++ b/zephyr/samples/Signatures/prj.conf
@@ -8,12 +8,12 @@ CONFIG_LIBOQS_ENABLE_SIG_DILITHIUM=y
 CONFIG_PICOLIBC=y
 CONFIG_TEST_RANDOM_GENERATOR=y
 
-# Set the stack size to 128K
-CONFIG_MAIN_STACK_SIZE=131072
+# Set the stack size to 512K
+CONFIG_MAIN_STACK_SIZE=524288
 
-# Enable malloc and set the available size to 128K
+# Enable malloc and set the available size to 256K
 CONFIG_COMMON_LIBC_MALLOC=y
-CONFIG_COMMON_LIBC_MALLOC_ARENA_SIZE=131072
+CONFIG_COMMON_LIBC_MALLOC_ARENA_SIZE=262144
 
 CONFIG_SPEED_OPTIMIZATIONS=y
 CONFIG_LOG=y
diff --git a/zephyr/samples/Signatures/sample.yaml b/zephyr/samples/Signatures/sample.yaml
index 1f9c30cc8f..601f7d4ab2 100644
--- a/zephyr/samples/Signatures/sample.yaml
+++ b/zephyr/samples/Signatures/sample.yaml
@@ -10,7 +10,7 @@ common:
 
 tests:
   sample.crypto.liboqs_signature_example:
-    timeout: 900
+    timeout: 1500
     integration_platforms:
       - qemu_x86
       - qemu_cortex_a53
diff --git a/zephyr/samples/Signatures/src/main.c b/zephyr/samples/Signatures/src/main.c
index 6e8b596596..9b2bc54822 100644
--- a/zephyr/samples/Signatures/src/main.c
+++ b/zephyr/samples/Signatures/src/main.c
@@ -171,15 +171,14 @@ int main(void)
 		const char *alg_name = OQS_SIG_alg_identifier(i);
 		if (!OQS_SIG_alg_is_enabled(alg_name)) {
 			printf("Signature algorithm %s not enabled!\n", alg_name);
-			OQS_destroy();
-			return EXIT_FAILURE;
 		}
+		else {
+			rc = sig_test_correctness(alg_name);
 
-		rc = sig_test_correctness(alg_name);
-
-		if (rc != OQS_SUCCESS) {
-			OQS_destroy();
-			return EXIT_FAILURE;
+			if (rc != OQS_SUCCESS) {
+				OQS_destroy();
+				return EXIT_FAILURE;
+			}
 		}
 	}