From 0b35736a5ed00849ea3efc37338a6bb5fd60fa73 Mon Sep 17 00:00:00 2001 From: Maamoun TK Date: Sat, 15 Jul 2023 15:38:26 +0300 Subject: [PATCH 01/10] Add support for Hacl_AES_128_GCM_NI and Hacl_AES_128_GCM_M32 --- CMakeLists.txt | 73 +- benchmarks/aesgcm.cc | 262 +++++ config/Config.h.in | 1 + config/config.json | 37 +- config/default_config.cmake | 20 +- config/toolchain.cmake | 9 + cpu-features/include/hacl-cpu-features.h | 2 + cpu-features/src/cpu-features.c | 66 +- include/Hacl_AES_128_BitSlice.h | 73 ++ include/Hacl_AES_128_GCM_M32.h | 76 ++ include/Hacl_AES_128_GCM_NI.h | 75 ++ include/Hacl_AES_128_NI.h | 81 ++ include/Hacl_Gf128_NI.h | 65 ++ include/Hacl_Gf128_PreComp.h | 54 ++ include/internal/Hacl_AES_128_BitSlice.h | 83 ++ include/internal/Hacl_Lib.h | 69 ++ include/internal/Hacl_Spec.h | 4 + include/libintvector.h | 63 +- src/EverCrypt_AEAD.c | 204 +++- src/Hacl_AES_128_BitSlice.c | 1105 ++++++++++++++++++++++ src/Hacl_AES_128_GCM_M32.c | 208 ++++ src/Hacl_AES_128_GCM_NI.c | 409 ++++++++ src/Hacl_AES_128_NI.c | 1084 +++++++++++++++++++++ src/Hacl_Gf128_NI.c | 359 +++++++ src/Hacl_Gf128_PreComp.c | 461 +++++++++ src/Hacl_Lib.c | 193 ++++ third-party/bearssl/README.md | 1 + third-party/bearssl/aes_ct64.c | 400 ++++++++ third-party/bearssl/aes_ct64_ctr.c | 117 +++ third-party/bearssl/aes_ct64_enc.c | 117 +++ third-party/bearssl/bearssl_aead.h | 410 ++++++++ third-party/bearssl/bearssl_block.h | 161 ++++ third-party/bearssl/bearssl_hash.h | 76 ++ third-party/bearssl/config.cmake | 9 + third-party/bearssl/dec32le.c | 40 + third-party/bearssl/enc32le.c | 40 + third-party/bearssl/gcm.c | 321 +++++++ third-party/bearssl/ghash_ctmul64.c | 156 +++ third-party/bearssl/inner.h | 211 +++++ tools/configure.py | 2 + 40 files changed, 7162 insertions(+), 35 deletions(-) create mode 100644 benchmarks/aesgcm.cc create mode 100644 include/Hacl_AES_128_BitSlice.h create mode 100644 include/Hacl_AES_128_GCM_M32.h create mode 100644 include/Hacl_AES_128_GCM_NI.h create mode 
100644 include/Hacl_AES_128_NI.h create mode 100644 include/Hacl_Gf128_NI.h create mode 100644 include/Hacl_Gf128_PreComp.h create mode 100644 include/internal/Hacl_AES_128_BitSlice.h create mode 100644 include/internal/Hacl_Lib.h create mode 100644 src/Hacl_AES_128_BitSlice.c create mode 100644 src/Hacl_AES_128_GCM_M32.c create mode 100644 src/Hacl_AES_128_GCM_NI.c create mode 100644 src/Hacl_AES_128_NI.c create mode 100644 src/Hacl_Gf128_NI.c create mode 100644 src/Hacl_Gf128_PreComp.c create mode 100644 src/Hacl_Lib.c create mode 100644 third-party/bearssl/README.md create mode 100644 third-party/bearssl/aes_ct64.c create mode 100644 third-party/bearssl/aes_ct64_ctr.c create mode 100644 third-party/bearssl/aes_ct64_enc.c create mode 100644 third-party/bearssl/bearssl_aead.h create mode 100644 third-party/bearssl/bearssl_block.h create mode 100644 third-party/bearssl/bearssl_hash.h create mode 100644 third-party/bearssl/config.cmake create mode 100644 third-party/bearssl/dec32le.c create mode 100644 third-party/bearssl/enc32le.c create mode 100644 third-party/bearssl/gcm.c create mode 100644 third-party/bearssl/ghash_ctmul64.c create mode 100644 third-party/bearssl/inner.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 825b1192..03ee9969 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -175,6 +175,7 @@ endif() # - SOURCES_std: All regular files # - SOURCES_vec128: Files that require vec128 hardware # - SOURCES_vec256: Files that require vec256 hardware +# - SOURCES_aesni_pclmul: Files that require aes-ni/pclmul hardware # Remove files that require missing toolchain features # and enable the features for compilation that are available. 
@@ -284,6 +285,44 @@ if(TOOLCHAIN_CAN_COMPILE_VALE) set(HACL_CAN_COMPILE_VALE 1) endif() +if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL) + add_compile_options( + -DHACL_CAN_COMPILE_VEC128 + ) + set(HACL_CAN_COMPILE_AESNI_PCLMUL 1) + + # # We make separate compilation units (objects) for each hardware feature + list(LENGTH SOURCES_aesni_pclmul SOURCES_AESNI_PCLMUL_LEN) + + if(NOT SOURCES_AESNI_PCLMUL_LEN EQUAL 0) + set(HACL_AESNI_PCLMUL_O ON) + add_library(hacl_aesni_pclmul OBJECT ${SOURCES_aesni_pclmul}) + target_include_directories(hacl_aesni_pclmul PRIVATE) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86_64|amd64|AMD64") + add_compile_options( + -DHACL_CAN_COMPILE_VEC256 + ) + + if(MSVC) + # Nothing to do here. MSVC has it covered + else() + target_compile_options(hacl_aesni_pclmul PRIVATE + -msse2 + -msse3 + -msse4.1 + -msse4.2 + -maes + -mpclmul + ) + endif(MSVC) + elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8") + target_compile_options(hacl_aesni_pclmul PRIVATE + -march=armv8-a+crypto + ) + endif() + endif() +endif() + if(TOOLCHAIN_CAN_COMPILE_INLINE_ASM) message(STATUS "Detected inline assembly support") set(HACL_CAN_COMPILE_INLINE_ASM 1) @@ -348,6 +387,11 @@ if(TOOLCHAIN_CAN_COMPILE_VEC256 AND HACL_VEC256_O) target_link_libraries(hacl PRIVATE $<TARGET_OBJECTS:hacl_vec256>) endif() +if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL AND HACL_AESNI_PCLMUL_O) + add_dependencies(hacl hacl_aesni_pclmul) + target_link_libraries(hacl PRIVATE $<TARGET_OBJECTS:hacl_aesni_pclmul>) +endif() + # # Static library add_library(hacl_static STATIC ${SOURCES_std} ${VALE_OBJECTS}) @@ -359,6 +403,10 @@ if(TOOLCHAIN_CAN_COMPILE_VEC256 AND HACL_VEC256_O) target_sources(hacl_static PRIVATE $<TARGET_OBJECTS:hacl_vec256>) endif() +if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL AND HACL_AESNI_PCLMUL_O) + target_sources(hacl_static PRIVATE $<TARGET_OBJECTS:hacl_aesni_pclmul>) +endif() + # Install # # This allows package maintainers to control the install destination by setting # # the appropriate cache variables. 
@@ -399,12 +447,13 @@ install(DIRECTORY vale/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/vale # # Install config.h install(FILES build/config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hacl) -# The CPU detection is used for testing and benchmarking -if(ENABLE_TESTS OR ENABLE_BENCHMARKS) - # CPU feature detection for tests - add_library(hacl_cpu_features OBJECT ${PROJECT_SOURCE_DIR}/cpu-features/src/cpu-features.c) - target_include_directories(hacl_cpu_features PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) -endif(ENABLE_TESTS OR ENABLE_BENCHMARKS) +# CPU feature detection, always built and linked into hacl and hacl_static (also used by tests and benchmarks) +add_library(hacl_cpu_features OBJECT ${PROJECT_SOURCE_DIR}/cpu-features/src/cpu-features.c) +target_include_directories(hacl_cpu_features PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) +add_dependencies(hacl hacl_cpu_features) +target_link_libraries(hacl PRIVATE $<TARGET_OBJECTS:hacl_cpu_features>) +add_dependencies(hacl_static hacl_cpu_features) +target_link_libraries(hacl_static PRIVATE $<TARGET_OBJECTS:hacl_cpu_features>) # Add ecckiila for benchmarks if(ENABLE_BENCHMARKS) @@ -426,6 +475,11 @@ if(ENABLE_BENCHMARKS) target_include_directories(digestif PUBLIC ${PROJECT_SOURCE_DIR}/third-party/digestif) endif(ENABLE_BENCHMARKS) +# Add bearssl for benchmarks +if(ENABLE_BENCHMARKS) + include(${PROJECT_SOURCE_DIR}/third-party/bearssl/config.cmake) + add_library(bearssl OBJECT ${SOURCES_bearssl}) +endif(ENABLE_BENCHMARKS) # Testing # It's only one binary. Everything else is done with gtest arguments. 
@@ -465,11 +519,10 @@ if(ENABLE_TESTS) target_compile_options(${TEST_NAME} PRIVATE /std:c++20) endif(MSVC) - add_dependencies(${TEST_NAME} hacl hacl_cpu_features) + add_dependencies(${TEST_NAME} hacl) target_link_libraries(${TEST_NAME} PRIVATE gtest_main hacl_static - hacl_cpu_features nlohmann_json::nlohmann_json ) target_include_directories(${TEST_NAME} PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) @@ -542,13 +595,13 @@ if(ENABLE_BENCHMARKS) target_compile_options(${BENCH_NAME} PRIVATE /std:c++20) endif(NOT MSVC) - add_dependencies(${BENCH_NAME} hacl hacl_cpu_features) + add_dependencies(${BENCH_NAME} hacl) target_link_libraries(${BENCH_NAME} PRIVATE hacl_static ecckiila blake2 digestif - hacl_cpu_features + bearssl benchmark::benchmark ) endforeach() diff --git a/benchmarks/aesgcm.cc b/benchmarks/aesgcm.cc new file mode 100644 index 00000000..edc41310 --- /dev/null +++ b/benchmarks/aesgcm.cc @@ -0,0 +1,262 @@ +/* + * Copyright 2023 Cryspen Sarl + * + * Licensed under the Apache License, Version 2.0 or MIT. 
+ * - http://www.apache.org/licenses/LICENSE-2.0 + * - http://opensource.org/licenses/MIT + */ + +#include "util.h" + +#include "krml/internal/target.h" +#ifdef HACL_CAN_COMPILE_AESNI_PCLMUL +#include "Hacl_AES_128_GCM_NI.h" +#endif +#include "Hacl_AES_128_GCM_M32.h" +#include "EverCrypt_AEAD.h" +#include "../third-party/bearssl/bearssl_block.h" +#include "../third-party/bearssl/bearssl_hash.h" +#include "../third-party/bearssl/bearssl_aead.h" + +static bytes key(16, 7); +static bytes nonce(12, 9); +static bytes mac(16, 0); + +#ifdef HACL_CAN_COMPILE_AESNI_PCLMUL +static void +HACL_AES_128_GCM_NI_encrypt(benchmark::State& state) +{ + bytes plaintext(state.range(0), 0x37); + bytes ciphertext(state.range(0) + 16, 0); + + for (auto _ : state) { + Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Hacl_AES_128_GCM_NI_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); + KRML_HOST_FREE(ctx); + } +} + +BENCHMARK(HACL_AES_128_GCM_NI_encrypt)->Setup(DoSetup)->Apply(Range); + +static void +HACL_AES_128_GCM_NI_aad(benchmark::State& state) +{ + bytes aad(state.range(0), 0x37); + + for (auto _ : state) { + Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Hacl_AES_128_GCM_NI_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); + KRML_HOST_FREE(ctx); + } +} + +BENCHMARK(HACL_AES_128_GCM_NI_aad)->Setup(DoSetup)->Apply(Range); +#endif + +static void +HACL_AES_128_GCM_M32_encrypt(benchmark::State& state) +{ + bytes plaintext(state.range(0), 0x37); + bytes ciphertext(state.range(0) + 16, 0); + + for (auto _ : state) { + uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + 
Hacl_AES_128_GCM_M32_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); + KRML_HOST_FREE(ctx); + } +} + +BENCHMARK(HACL_AES_128_GCM_M32_encrypt)->Setup(DoSetup)->Apply(Range); + +static void +HACL_AES_128_GCM_M32_aad(benchmark::State& state) +{ + bytes aad(state.range(0), 0x37); + + for (auto _ : state) { + uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + Hacl_AES_128_GCM_M32_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); + KRML_HOST_FREE(ctx); + } +} + +BENCHMARK(HACL_AES_128_GCM_M32_aad)->Setup(DoSetup)->Apply(Range); + +static void +EverCrypt_AES128_GCM_encrypt(benchmark::State& state) +{ + bytes plaintext(state.range(0), 0x37); + bytes ciphertext(state.range(0), 0); + + for (auto _ : state) { + EverCrypt_AEAD_state_s* ctx; + EverCrypt_Error_error_code res = EverCrypt_AEAD_create_in( + Spec_Agile_AEAD_AES128_GCM, &ctx, key.data()); + + if (res != EverCrypt_Error_Success) { + state.SkipWithError("Could not allocate AEAD state."); + break; + } + + EverCrypt_AEAD_encrypt(ctx, + nonce.data(), + nonce.size(), + NULL, + 0, + plaintext.data(), + plaintext.size(), + ciphertext.data(), + mac.data()); + + EverCrypt_AEAD_free(ctx); + } +} + +BENCHMARK(EverCrypt_AES128_GCM_encrypt)->Setup(DoSetup)->Apply(Range); + +static void +EverCrypt_AES128_GCM_aad(benchmark::State& state) +{ + bytes aad(state.range(0), 0x37); + + for (auto _ : state) { + EverCrypt_AEAD_state_s* ctx; + EverCrypt_Error_error_code res = EverCrypt_AEAD_create_in( + Spec_Agile_AEAD_AES128_GCM, &ctx, key.data()); + + if (res != EverCrypt_Error_Success) { + state.SkipWithError("Could not allocate AEAD state."); + break; + } + + EverCrypt_AEAD_encrypt(ctx, + nonce.data(), + nonce.size(), + aad.data(), + aad.size(), + NULL, + 0, + NULL, + mac.data()); 
+ + EverCrypt_AEAD_free(ctx); + } +} + +BENCHMARK(EverCrypt_AES128_GCM_aad)->Setup(DoSetup)->Apply(Range); + +#ifndef NO_OPENSSL +static void +OpenSSL_aes_128_gcm_encrypt(benchmark::State& state) +{ + bytes plaintext(state.range(0), 0x37); + bytes ciphertext(state.range(0), 0); + + for (auto _ : state) { + int out_len, unused_len; + EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new(); + int result = EVP_EncryptInit_ex2( + ctx, EVP_aes_128_gcm(), key.data(), nonce.data(), NULL); + if (result != 1) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + result = EVP_EncryptUpdate( + ctx, ciphertext.data(), &out_len, plaintext.data(), plaintext.size()); + if (result != 1) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + result = EVP_EncryptFinal_ex(ctx, mac.data(), &unused_len); + if (result != 1 || unused_len != 0) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + EVP_CIPHER_CTX_free(ctx); + } +} + +BENCHMARK(OpenSSL_aes_128_gcm_encrypt)->Setup(DoSetup)->Apply(Range); + +static void +OpenSSL_aes_128_gcm_aad(benchmark::State& state) +{ + bytes aad(state.range(0), 0x37); + + for (auto _ : state) { + int out_len, unused_len; + EVP_CIPHER_CTX* ctx = EVP_CIPHER_CTX_new(); + int result = EVP_EncryptInit_ex2( + ctx, EVP_aes_128_gcm(), key.data(), nonce.data(), NULL); + if (result != 1) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + result = EVP_EncryptUpdate( + ctx, NULL, &out_len, aad.data(), aad.size()); + if (result != 1) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + result = EVP_EncryptFinal_ex(ctx, mac.data(), &unused_len); + if (result != 1 || unused_len != 0) { + state.SkipWithError(""); + EVP_CIPHER_CTX_free(ctx); + break; + } + EVP_CIPHER_CTX_free(ctx); + } +} + +BENCHMARK(OpenSSL_aes_128_gcm_aad)->Setup(DoSetup)->Apply(Range); +#endif + +static void +BearSSL_CT64_AES128_GCM_encrypt(benchmark::State& state) +{ + bytes plaintext(state.range(0), 0x37); + + for (auto 
_ : state) { + br_aes_ct64_ctr_keys bc; + br_gcm_context gc; + br_aes_ct64_ctr_init(&bc, key.data(), key.size()); + br_gcm_init(&gc, &bc.vtable, br_ghash_ctmul64); + + br_gcm_reset(&gc, nonce.data(), nonce.size()); + br_gcm_flip(&gc); + br_gcm_run(&gc, 1, plaintext.data(), plaintext.size()); + br_gcm_get_tag(&gc, mac.data()); + } +} + +BENCHMARK(BearSSL_CT64_AES128_GCM_encrypt)->Setup(DoSetup)->Apply(Range); + +static void +BearSSL_CT64_AES128_GCM_aad(benchmark::State& state) +{ + bytes aad(state.range(0), 0x37); + + for (auto _ : state) { + br_aes_ct64_ctr_keys bc; + br_gcm_context gc; + br_aes_ct64_ctr_init(&bc, key.data(), key.size()); + br_gcm_init(&gc, &bc.vtable, br_ghash_ctmul64); + + br_gcm_reset(&gc, nonce.data(), nonce.size()); + br_gcm_aad_inject(&gc, aad.data(), aad.size()); + br_gcm_get_tag(&gc, mac.data()); + } +} + +BENCHMARK(BearSSL_CT64_AES128_GCM_aad)->Setup(DoSetup)->Apply(Range); + +BENCHMARK_MAIN(); diff --git a/config/Config.h.in b/config/Config.h.in index 05c5482b..d3777454 100644 --- a/config/Config.h.in +++ b/config/Config.h.in @@ -24,6 +24,7 @@ #cmakedefine HACL_CAN_COMPILE_VALE @HACL_CAN_COMPILE_VALE@ #cmakedefine HACL_CAN_COMPILE_VEC128 @HACL_CAN_COMPILE_VEC128@ #cmakedefine HACL_CAN_COMPILE_VEC256 @HACL_CAN_COMPILE_VEC256@ +#cmakedefine HACL_CAN_COMPILE_AESNI_PCLMUL @HACL_CAN_COMPILE_AESNI_PCLMUL@ #cmakedefine HACL_CAN_COMPILE_INLINE_ASM @HACL_CAN_COMPILE_INLINE_ASM@ #cmakedefine LINUX_NO_EXPLICIT_BZERO @LINUX_NO_EXPLICIT_BZERO@ diff --git a/config/config.json b/config/config.json index 06ff34fe..e06d1780 100644 --- a/config/config.json +++ b/config/config.json @@ -6,6 +6,9 @@ "vale_include_paths": [ "vale/include" ], + "cpu_features_include_paths": [ + "cpu-features/include" + ], "hacl_sources": { "nacl": [ { @@ -19,7 +22,36 @@ "features": "std" } ], - "aesgcm": [], + "aesgcm": [ + { + "file": "Hacl_Lib.c", + "features": "std" + }, + { + "file": "Hacl_Gf128_PreComp.c", + "features": "std" + }, + { + "file": "Hacl_AES_128_BitSlice.c", + 
"features": "std" + }, + { + "file": "Hacl_AES_128_GCM_M32.c", + "features": "std" + }, + { + "file": "Hacl_Gf128_NI.c", + "features": "aesni_pclmul" + }, + { + "file": "Hacl_AES_128_NI.c", + "features": "aesni_pclmul" + }, + { + "file": "Hacl_AES_128_GCM_NI.c", + "features": "aesni_pclmul" + } + ], "drbg": [ { "file": "Hacl_HMAC_DRBG.c", @@ -552,6 +584,9 @@ ], "rsapss": [ "rsapss.cc" + ], + "aesgcm": [ + "aesgcm.cc" ] } } diff --git a/config/default_config.cmake b/config/default_config.cmake index ee7e66ab..892bbad0 100644 --- a/config/default_config.cmake +++ b/config/default_config.cmake @@ -53,6 +53,10 @@ set(SOURCES_std ${PROJECT_SOURCE_DIR}/src/EverCrypt_Chacha20Poly1305.c ${PROJECT_SOURCE_DIR}/src/EverCrypt_Poly1305.c ${PROJECT_SOURCE_DIR}/src/EverCrypt_AEAD.c + ${PROJECT_SOURCE_DIR}/src/Hacl_Lib.c + ${PROJECT_SOURCE_DIR}/src/Hacl_Gf128_PreComp.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_BitSlice.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_GCM_M32.c ) set(SOURCES_vec256 ${PROJECT_SOURCE_DIR}/src/Hacl_Hash_Blake2b_256.c @@ -86,7 +90,6 @@ set(SOURCES_vec128 ${PROJECT_SOURCE_DIR}/src/Hacl_HPKE_P256_CP128_SHA256.c ) set(SOURCES_m32 - ) set(SOURCES_vale ${PROJECT_SOURCE_DIR}/src/Hacl_Curve25519_64.c @@ -103,6 +106,11 @@ set(SOURCES_std_vale ${PROJECT_SOURCE_DIR}/src/Hacl_HPKE_Curve64_CP32_SHA256.c ${PROJECT_SOURCE_DIR}/src/Hacl_HPKE_Curve64_CP32_SHA512.c ) +set(SOURCES_aesni_pclmul + ${PROJECT_SOURCE_DIR}/src/Hacl_Gf128_NI.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_NI.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_GCM_NI.c +) set(INCLUDES ${PROJECT_SOURCE_DIR}/include/Hacl_NaCl.h ${PROJECT_SOURCE_DIR}/karamel/include/krml/internal/types.h @@ -361,6 +369,14 @@ set(PUBLIC_INCLUDES ${PROJECT_SOURCE_DIR}/include/EverCrypt_Chacha20Poly1305.h ${PROJECT_SOURCE_DIR}/include/EverCrypt_Poly1305.h ${PROJECT_SOURCE_DIR}/include/EverCrypt_AEAD.h + ${PROJECT_SOURCE_DIR}/include/internal/Hacl_Lib.h + ${PROJECT_SOURCE_DIR}/include/internal/Hacl_AES_128_BitSlice.h + 
${PROJECT_SOURCE_DIR}/include/Hacl_Gf128_PreComp.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_BitSlice.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_GCM_M32.h + ${PROJECT_SOURCE_DIR}/include/Hacl_Gf128_NI.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_NI.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_GCM_NI.h ) set(ALGORITHMS nacl @@ -392,6 +408,7 @@ set(INCLUDE_PATHS ${PROJECT_SOURCE_DIR}/karamel/include ${PROJECT_SOURCE_DIR}/karamel/krmllib/dist/minimal ${PROJECT_SOURCE_DIR}/vale/include + ${PROJECT_SOURCE_DIR}/cpu-features/include ) set(TEST_SOURCES ${PROJECT_SOURCE_DIR}/tests/detection.cc @@ -435,6 +452,7 @@ set(BENCHMARK_SOURCES ${PROJECT_SOURCE_DIR}/benchmarks/drbg.cc ${PROJECT_SOURCE_DIR}/benchmarks/hmac.cc ${PROJECT_SOURCE_DIR}/benchmarks/rsapss.cc + ${PROJECT_SOURCE_DIR}/benchmarks/aesgcm.cc ) set(VALE_SOURCES_osx ${PROJECT_SOURCE_DIR}/vale/src/cpuid-x86_64-darwin.S diff --git a/config/toolchain.cmake b/config/toolchain.cmake index 2877034f..51de77e4 100644 --- a/config/toolchain.cmake +++ b/config/toolchain.cmake @@ -91,6 +91,15 @@ if(NOT DEFINED TOOLCHAIN_CAN_COMPILE_VALE) endif() endif() +## Check for aes-ni/pclmul support +if(NOT DEFINED TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL) + # Always enable for x64 and arm64 + set(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL FALSE) + if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64|aarch64|arm64|arm64v8") + set(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL TRUE) + endif() +endif() + # Check for inline assembly support if(NOT DEFINED TOOLCHAIN_CAN_COMPILE_INLINE_ASM) set(TOOLCHAIN_CAN_COMPILE_INLINE_ASM OFF) diff --git a/cpu-features/include/hacl-cpu-features.h b/cpu-features/include/hacl-cpu-features.h index 221a1100..7e87d470 100644 --- a/cpu-features/include/hacl-cpu-features.h +++ b/cpu-features/include/hacl-cpu-features.h @@ -20,6 +20,8 @@ extern "C" unsigned int hacl_vec256_support(); + unsigned int hacl_aesgcm_support(); + unsigned int vale_aesgcm_support(); unsigned int vale_x25519_support(); diff --git 
a/cpu-features/src/cpu-features.c b/cpu-features/src/cpu-features.c index f306b549..f22de709 100644 --- a/cpu-features/src/cpu-features.c +++ b/cpu-features/src/cpu-features.c @@ -35,6 +35,28 @@ #error "Unsupported OS" #endif +#include +#if defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ + defined(__GLIBC__) && defined(__GLIBC_PREREQ) +#if __GLIBC_PREREQ(2, 16) +#include +#include +#define GETAUXVAL_FUNC +#ifndef HWCAP_ASIMD +#define HWCAP_ASIMD (1 << 1) +#endif +#ifndef HWCAP_AES +#define HWCAP_AES (1 << 3) +#endif +#ifndef HWCAP_PMULL +#define HWCAP_PMULL (1 << 4) +#endif +#ifndef HWCAP_SHA2 +#define HWCAP_SHA2 (1 << 6) +#endif +#endif +#endif + // === x86 | x64 #if (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_MACOS)) && \ @@ -116,6 +138,8 @@ static unsigned int _bmi2 = 0; static unsigned int _pclmul = 0; static unsigned int _movbe = 0; static unsigned int _cmov = 0; +// AArch64-specific variables +static unsigned int _asimd = 0; // API @@ -124,7 +148,9 @@ hacl_vec128_support() { #if defined(CPU_FEATURES_X64) || defined(CPU_FEATURES_X86) return _sse && _sse2 && _sse3 && _sse41 && _sse41 && _cmov; -#elif defined(CPU_FEATURES_ARM64) || defined(CPU_FEATURES_POWERZ) +#elif defined(CPU_FEATURES_ARM64) + return _asimd; +#elif defined(CPU_FEATURES_POWERZ) return 1; #else return 0; @@ -137,6 +163,12 @@ hacl_vec256_support() return _avx && _avx2; } +unsigned int +hacl_aesgcm_support() +{ + return hacl_vec128_support() && _aes && _pclmul; +} + unsigned int vale_aesgcm_support() { @@ -186,15 +218,37 @@ hacl_init_cpu_features() _sse42 = (ecx & ECX_SSE4_2) != 0; #endif +#if defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ + defined(GETAUXVAL_FUNC) + unsigned long hwcap = getauxval(AT_HWCAP); + _asimd = ((hwcap & HWCAP_ASIMD) != 0) ? 1 : 0; + _aes = ((hwcap & HWCAP_AES) != 0) ? 1 : 0; + _pclmul = ((hwcap & HWCAP_PMULL) != 0) ? 1 : 0; + _sha = ((hwcap & HWCAP_SHA2) != 0) ? 
1 : 0; +#endif + #if defined(CPU_FEATURES_MACOS) && defined(CPU_FEATURES_ARM64) + int err; int64_t ret = 0; size_t size = sizeof(ret); - sysctlbyname("hw.optional.neon", &ret, &size, NULL, 0); - if (ret == 1) { - _aes = 1; - _sha = 1; - } + err = sysctlbyname("hw.optional.AdvSIMD", &ret, &size, NULL, 0); + _asimd = (err == 0 && ret > 0) ? 1 : 0; + + ret = 0; + size = sizeof(ret); + err = sysctlbyname("hw.optional.arm.FEAT_AES", &ret, &size, NULL, 0); + _aes = (err == 0 && ret > 0) ? 1 : 0; + + ret = 0; + size = sizeof(ret); + err = sysctlbyname("hw.optional.arm.FEAT_PMULL", &ret, &size, NULL, 0); + _pclmul = (err == 0 && ret > 0) ? 1 : 0; + + ret = 0; + size = sizeof(ret); + err = sysctlbyname("hw.optional.arm.FEAT_SHA256", &ret, &size, NULL, 0); + _sha = (err == 0 && ret > 0) ? 1 : 0; #endif } diff --git a/include/Hacl_AES_128_BitSlice.h b/include/Hacl_AES_128_BitSlice.h new file mode 100644 index 00000000..3a146a89 --- /dev/null +++ b/include/Hacl_AES_128_BitSlice.h @@ -0,0 +1,73 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_BitSlice_H +#define __Hacl_AES_128_BitSlice_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +typedef uint64_t *Hacl_AES_128_BitSlice_aes_ctx; + +typedef uint8_t *Hacl_AES_128_BitSlice_skey; + +void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); + +void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); + +void +Hacl_AES_128_BitSlice_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_128_BitSlice_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_BitSlice_H_DEFINED +#endif diff --git a/include/Hacl_AES_128_GCM_M32.h b/include/Hacl_AES_128_GCM_M32.h new file mode 100644 index 00000000..29125377 --- /dev/null +++ b/include/Hacl_AES_128_GCM_M32.h @@ -0,0 +1,76 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * 
furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_GCM_M32_H +#define __Hacl_AES_128_GCM_M32_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_PreComp.h" +#include "Hacl_AES_128_BitSlice.h" + +extern uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len; + +typedef uint64_t *Hacl_AES_128_GCM_M32_aes_gcm_ctx; + +void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key); + +void +Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_GCM_M32_H_DEFINED +#endif diff --git a/include/Hacl_AES_128_GCM_NI.h b/include/Hacl_AES_128_GCM_NI.h new file mode 100644 index 00000000..ab520316 --- /dev/null +++ b/include/Hacl_AES_128_GCM_NI.h @@ -0,0 +1,75 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * 
Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_128_GCM_NI_H +#define __Hacl_AES_128_GCM_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_NI.h" +#include "Hacl_AES_128_NI.h" +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_GCM_NI_aes_gcm_ctx; + +void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key); + +void +Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_GCM_NI_H_DEFINED +#endif diff --git a/include/Hacl_AES_128_NI.h b/include/Hacl_AES_128_NI.h new file mode 100644 index 00000000..f3c148b5 --- /dev/null +++ b/include/Hacl_AES_128_NI.h @@ -0,0 +1,81 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_NI_H +#define __Hacl_AES_128_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_NI_aes_ctx; + +typedef uint8_t *Hacl_AES_128_NI_skey; + +void +Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); + +void +Hacl_AES_128_NI_aes128_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +); + +void +Hacl_AES_128_NI_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_128_NI_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_NI_H_DEFINED +#endif diff --git a/include/Hacl_Gf128_NI.h b/include/Hacl_Gf128_NI.h new file mode 100644 index 00000000..46322a32 --- /dev/null +++ b/include/Hacl_Gf128_NI.h @@ -0,0 +1,65 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files 
(the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef __Hacl_Gf128_NI_H
+#define __Hacl_Gf128_NI_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <string.h>
+#include "krml/internal/types.h"
+#include "krml/lowstar_endianness.h"
+#include "krml/internal/target.h"
+
+#include "libintvector.h"
+/* NOTE(review): presumably loads the GHASH key into ctx; confirm in src/Hacl_Gf128_NI.c. */
+void Hacl_Gf128_NI_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key);
+/* NOTE(review): presumably absorbs len bytes of text into the GHASH state held in ctx. */
+void
+Hacl_Gf128_NI_gcm_update_blocks(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t len,
+  uint8_t *text
+);
+/* A function pointer, not a function; it takes the same argument types as gcm_update_blocks. */
+extern void
+(*Hacl_Gf128_NI_gcm_update_padded)(
+  Lib_IntVector_Intrinsics_vec128 *x0,
+  uint32_t x1,
+  uint8_t *x2
+);
+/* NOTE(review): presumably writes the final GHASH value held in ctx to tag. */
+void Hacl_Gf128_NI_gcm_emit(uint8_t *tag, Lib_IntVector_Intrinsics_vec128 *ctx);
+/* NOTE(review): presumably one-shot GHASH of len bytes of text under key, written to tag. */
+void Hacl_Gf128_NI_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#define __Hacl_Gf128_NI_H_DEFINED
+#endif
diff --git a/include/Hacl_Gf128_PreComp.h b/include/Hacl_Gf128_PreComp.h
new file mode 100644
index 00000000..3d67add1
--- /dev/null
+++ b/include/Hacl_Gf128_PreComp.h
@@ -0,0 +1,54 @@
+/* MIT License
+ *
+
* Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+
+#ifndef __Hacl_Gf128_PreComp_H
+#define __Hacl_Gf128_PreComp_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <string.h>
+#include "krml/internal/types.h"
+#include "krml/lowstar_endianness.h"
+#include "krml/internal/target.h"
+/* NOTE(review): presumably fills ctx (64-bit words) from the GHASH key; confirm in the .c file. */
+void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key);
+/* NOTE(review): presumably absorbs len bytes of text into the GHASH state held in ctx. */
+void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text);
+/* A function pointer, not a function; it takes the same argument types as gcm_update_blocks. */
+extern void
+(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2);
+/* NOTE(review): presumably writes the final GHASH value held in ctx to tag. */
+void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx);
+/* NOTE(review): presumably one-shot GHASH of len bytes of text under key, written to tag. */
+void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#define __Hacl_Gf128_PreComp_H_DEFINED
+#endif
diff --git a/include/internal/Hacl_AES_128_BitSlice.h b/include/internal/Hacl_AES_128_BitSlice.h
new file mode 100644
index 00000000..3b95bb9b
--- /dev/null
+++ b/include/internal/Hacl_AES_128_BitSlice.h
@@ -0,0 +1,83 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation
+ * Copyright (c) 2022-2023 HACL* Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef __internal_Hacl_AES_128_BitSlice_H
+#define __internal_Hacl_AES_128_BitSlice_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <string.h>
+#include "krml/internal/types.h"
+#include "krml/lowstar_endianness.h"
+#include "krml/internal/target.h"
+
+#include "internal/Hacl_Lib.h"
+#include "../Hacl_AES_128_BitSlice.h"
+/* Bit-transposes the 8 words of inp and stores the first two results to out as 16 LE bytes. */
+void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp);
+/* Loads the 16-byte block k into 8 words, then copies each word's low 16 bits to all 4 lanes. */
+void Hacl_Impl_AES_CoreBitSlice_load_key1(uint64_t *out, uint8_t *k);
+/* Copies 12 bytes of nonce1 into a zeroed 16-byte block and loads it with load_key1. */
+void Hacl_Impl_AES_CoreBitSlice_load_nonce(uint64_t *out, uint8_t *nonce1);
+/* Builds the state for counter .. counter + 3 (stored big-endian) and XORs in nonce1. */
+void Hacl_Impl_AES_CoreBitSlice_load_state(uint64_t *out, uint64_t *nonce1, uint32_t counter);
+/* st[i] ^= ost[i] for each of the 8 state words. */
+void Hacl_Impl_AES_CoreBitSlice_xor_state_key1(uint64_t *st, uint64_t *ost);
+/* One full round on st: SubBytes, ShiftRows, MixColumns, then XOR with key. */
+void Hacl_Impl_AES_CoreBitSlice_aes_enc(uint64_t *st, uint64_t *key);
+/* Final round on st: SubBytes, ShiftRows, then XOR with key (no MixColumns). */
+void Hacl_Impl_AES_CoreBitSlice_aes_enc_last(uint64_t *st, uint64_t *key);
+/* Key-schedule helper: copies prev into next and applies the S-box; see the .c file for the rest. */
+void
+Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(uint64_t *next, uint64_t *prev, uint8_t rcon1);
+/* NOTE(review): presumably folds prev into next to finish one key-expansion step. */
+void Hacl_Impl_AES_CoreBitSlice_key_expansion_step(uint64_t *next, uint64_t *prev);
+/* NOTE(review): presumably AES-128-CTR over len bytes of inp, starting at block `counter`. */
+void
+Hacl_Impl_AES_Generic_aes128_ctr_bitslice(
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *inp,
+  uint64_t *ctx,
+  uint32_t counter
+);
+/* NOTE(review): presumably the AES-256 counterpart of aes128_ctr_bitslice (same signature). */
+void
+Hacl_Impl_AES_Generic_aes256_ctr_bitslice(
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *inp,
+  uint64_t *ctx,
+  uint32_t counter
+);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#define __internal_Hacl_AES_128_BitSlice_H_DEFINED
+#endif
diff --git a/include/internal/Hacl_Lib.h b/include/internal/Hacl_Lib.h
new file mode 100644
index 00000000..61b523ff
--- /dev/null
+++ b/include/internal/Hacl_Lib.h
@@ -0,0 +1,69 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft
Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+
+#ifndef __internal_Hacl_Lib_H
+#define __internal_Hacl_Lib_H
+
+#if defined(__cplusplus)
+extern "C" {
+#endif
+
+#include <string.h>
+#include "krml/internal/types.h"
+#include "krml/lowstar_endianness.h"
+#include "krml/internal/target.h"
+/* Pair of 64-bit words. */
+typedef struct Lib_Transposition64x8_uint64x2_s
+{
+  uint64_t fst;
+  uint64_t snd;
+}
+Lib_Transposition64x8_uint64x2;
+/* Pair of uint64x2 values, i.e. four 64-bit words. */
+typedef struct Lib_Transposition64x8_uint64x4_s
+{
+  Lib_Transposition64x8_uint64x2 fst;
+  Lib_Transposition64x8_uint64x2 snd;
+}
+Lib_Transposition64x8_uint64x4;
+/* Pair of uint64x4 values, i.e. eight 64-bit words. */
+typedef struct Lib_Transposition64x8_uint64x8_s
+{
+  Lib_Transposition64x8_uint64x4 fst;
+  Lib_Transposition64x8_uint64x4 snd;
+}
+Lib_Transposition64x8_uint64x8;
+/* NOTE(review): presumably a bit-level transposition of x; confirm in src/Hacl_Lib.c. */
+uint64_t Lib_Transposition64x8_transpose_bits64(uint64_t x);
+/* Eight-word variant; Hacl_AES_128_BitSlice.c calls it to transpose the 8-word AES state. */
+Lib_Transposition64x8_uint64x8
+Lib_Transposition64x8_transpose_bits64x8(Lib_Transposition64x8_uint64x8 a);
+
+#if defined(__cplusplus)
+}
+#endif
+
+#define __internal_Hacl_Lib_H_DEFINED
+#endif
diff --git a/include/internal/Hacl_Spec.h b/include/internal/Hacl_Spec.h
index fee56d84..e3ddd18f 100644
--- a/include/internal/Hacl_Spec.h
+++ b/include/internal/Hacl_Spec.h
@@ -40,6 +40,10 @@ extern "C" {
 #define Spec_Cipher_Expansion_Hacl_CHACHA20 0
 #define Spec_Cipher_Expansion_Vale_AES128 1
 #define Spec_Cipher_Expansion_Vale_AES256 2
+#define Spec_Cipher_Expansion_AESNI_PCLMUL_AES128 3
+#define Spec_Cipher_Expansion_AESNI_PCLMUL_AES256 4
+#define Spec_Cipher_Expansion_M32_AES128 5
+#define Spec_Cipher_Expansion_M32_AES256 6

 typedef uint8_t Spec_Cipher_Expansion_impl;

diff --git a/include/libintvector.h b/include/libintvector.h
index 99d11336..4c259dba 100644
--- a/include/libintvector.h
+++ b/include/libintvector.h
@@ -121,12 +121,18 @@ typedef __m128i Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \
   (_mm_loadu_si128((__m128i*)(x0)))

+#define Lib_IntVector_Intrinsics_vec128_load128_le(x0) \
+  (_mm_loadu_si128((__m128i*)(x0)))
+
 #define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \
   (_mm_storeu_si128((__m128i*)(x0), x1))

 #define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \
   (_mm_storeu_si128((__m128i*)(x0), x1))

+#define Lib_IntVector_Intrinsics_vec128_store128_le(x0, x1) \
+  (_mm_storeu_si128((__m128i*)(x0), x1))
+
 #define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
   (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)))

@@ -456,6 +462,30 @@ typedef __m256i Lib_IntVector_Intrinsics_vec256;

 typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;

+#if defined(__ARM_FEATURE_AES)
+/* ni_aes_enc: AESE of x0 with an all-zero key, then AESMC, then XOR with x1. */
+#define Lib_IntVector_Intrinsics_ni_aes_enc(x0, x1) \
+  ((uint32x4_t)(vaesmcq_u8(vaeseq_u8((uint8x16_t)(x0), vdupq_n_u8(0))) ^ (uint8x16_t)(x1)))
+/* ni_aes_enc_last: as ni_aes_enc but without the AESMC step. */
+#define Lib_IntVector_Intrinsics_ni_aes_enc_last(x0, x1) \
+  ((uint32x4_t)(vaeseq_u8((uint8x16_t)(x0), vdupq_n_u8(0)) ^ (uint8x16_t)(x1)))
+/* ni_aes_keygen_assist: picks bytes of AESE(x0, 0) and XORs x1 into 32-bit lanes 1 and 3. */
+static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_ni_aes_keygen_assist (Lib_IntVector_Intrinsics_vec128 x0, uint8_t x1){
+  uint8x16_t tmp = vaeseq_u8((uint8x16_t)x0, vdupq_n_u8(0));
+  return (uint32x4_t)((uint8x16_t){
+    tmp[4], tmp[1], tmp[14], tmp[11],
+    tmp[1], tmp[14], tmp[11], tmp[4],
+    tmp[12], tmp[9], tmp[6], tmp[3],
+    tmp[9], tmp[6], tmp[3], tmp[12]
+  } ^ (uint8x16_t)(uint32x4_t){0, x1, 0, x1});
+}
+/* ni_clmul: 64x64 polynomial multiply; bit 0 of x2 picks the half of x0, bit 4 that of x1. */
+#define Lib_IntVector_Intrinsics_ni_clmul(x0, x1, x2) \
+  ((x2) == 0x11? (uint32x4_t)vmull_high_p64((poly64x2_t)(x0), (poly64x2_t)(x1)) : \
+  (uint32x4_t)vmull_p64(vgetq_lane_u64((uint64x2_t)(x0),(x2)&1), vgetq_lane_u64((uint64x2_t)(x1),(x2)>>4)))
+
+#endif
+
 #define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \
   (veorq_u32(x0,x1))

@@ -486,12 +516,11 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_lognot(x0) \
   (vmvnq_u32(x0))

-
 #define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \
-  (vextq_u32(x0, vdupq_n_u8(0), 16-(x1)/8))
+  ((uint32x4_t)vextq_u8(vdupq_n_u8(0), (uint8x16_t)(x0), 16-(x1)/8))

 #define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \
-  (vextq_u32(x0, vdupq_n_u8(0), (x1)/8))
+  ((uint32x4_t)vextq_u8((uint8x16_t)(x0), vdupq_n_u8(0), (x1)/8))

 #define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \
   (vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x0), x1)))
@@ -525,11 +554,10 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \
   (vextq_u64(x0,x0,x1))

-
-/*
 #define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \
-  (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x1,x2,x3,x4)))
+  ((uint32x4_t){((uint32x4_t)(x0))[x1],((uint32x4_t)(x0))[x2],((uint32x4_t)(x0))[x3],((uint32x4_t)(x0))[x4]})

+/*
 #define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \
   (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2*x1+1,2*x1,2*x2+1,2*x2)))
 */
@@ -540,17 +568,17 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \
   (vld1q_u32((const uint32_t*) (x0)))

+#define Lib_IntVector_Intrinsics_vec128_load128_le(x0) \
+  ((uint32x4_t)vld1q_u8((const uint8_t*)(x0)))
+
 #define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \
   (vst1q_u32((uint32_t*)(x0),(x1)))

 #define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \
   (vst1q_u32((uint32_t*)(x0),(x1)))

-/*
-#define Lib_IntVector_Intrinsics_vec128_load_be(x0) \
-  ( Lib_IntVector_Intrinsics_vec128 l = vrev64q_u8(vld1q_u32((uint32_t*)(x0)));
-
-*/
+#define Lib_IntVector_Intrinsics_vec128_store128_le(x0, x1) \
+  (vst1q_u8((uint8_t*)(x0), (uint8x16_t)(x1)))

 #define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \
   (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))
@@ -558,10 +586,10 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \
   (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0))))))

-/*
-#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1) \
-  (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))))
-*/
+static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_vec128_load_be(uint8_t* x0){
+  uint64x2_t l = (uint64x2_t)vrev64q_u8(vld1q_u8(x0)); /* reverse the bytes of each 64-bit half */
+  return (uint32x4_t)vextq_u64(l, l, 1); /* then swap the halves: all 16 bytes reversed */
+}

 #define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \
   (vst1q_u32((uint32_t*)(x0),(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x1))))))
@@ -569,6 +597,11 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128;
 #define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \
   (vst1q_u32((uint32_t*)(x0),(vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(x1))))))

+static inline void Lib_IntVector_Intrinsics_vec128_store_be(uint8_t* x0, Lib_IntVector_Intrinsics_vec128 x1){
+  uint64x2_t l = (uint64x2_t)vrev64q_u8((uint8x16_t)x1); /* reverse the bytes of each 64-bit half */
+  vst1q_u8(x0, (uint8x16_t)vextq_u64(l, l, 1)); /* swap the halves, then store 16 bytes */
+}
+
 #define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \
   (vsetq_lane_u8(x1,x0,x2))

diff --git a/src/EverCrypt_AEAD.c b/src/EverCrypt_AEAD.c
index a4b306b7..6c21c319 100644
--- a/src/EverCrypt_AEAD.c
+++ b/src/EverCrypt_AEAD.c
@@ -26,8 +26,13 @@
 #include "EverCrypt_AEAD.h"

 #include "internal/Vale.h"
+#include "Hacl_AES_128_GCM_M32.h"
 #include "internal/Hacl_Spec.h"
 #include "config.h"
+#include "hacl-cpu-features.h"
+#if HACL_CAN_COMPILE_AESNI_PCLMUL /* tested after config.h, in the #if form the functions below use */
+#include "Hacl_AES_128_GCM_NI.h"
+#endif

 /**
 Both encryption and
decryption require a state that holds the key.
@@ -63,10 +68,14 @@ Spec_Agile_AEAD_alg EverCrypt_AEAD_alg_of_state(EverCrypt_AEAD_state_s *s)
         return Spec_Agile_AEAD_CHACHA20_POLY1305;
       }
     case Spec_Cipher_Expansion_Vale_AES128:
+    case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128:
+    case Spec_Cipher_Expansion_M32_AES128:
       {
         return Spec_Agile_AEAD_AES128_GCM;
       }
     case Spec_Cipher_Expansion_Vale_AES256:
+    case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256:
+    case Spec_Cipher_Expansion_M32_AES256:
       {
         return Spec_Agile_AEAD_AES256_GCM;
       }
@@ -93,12 +102,12 @@ create_in_chacha20_poly1305(EverCrypt_AEAD_state_s **dst, uint8_t *k)
 static EverCrypt_Error_error_code
 create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k)
 {
+  #if HACL_CAN_COMPILE_VALE
   bool has_aesni = EverCrypt_AutoConfig2_has_aesni();
   bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq();
   bool has_avx = EverCrypt_AutoConfig2_has_avx();
   bool has_sse = EverCrypt_AutoConfig2_has_sse();
   bool has_movbe = EverCrypt_AutoConfig2_has_movbe();
-  #if HACL_CAN_COMPILE_VALE
   if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe)
   {
     uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)480U, sizeof (uint8_t));
@@ -112,8 +121,31 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k)
     *dst = p;
     return EverCrypt_Error_Success;
   }
+  else
+  #elif HACL_CAN_COMPILE_AESNI_PCLMUL
+  if (hacl_aesgcm_support() != 0)
+  {
+    uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t));
+    Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek;
+    Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx, k);
+    EverCrypt_AEAD_state_s
+    *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s));
+    p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek });
+    *dst = p;
+    return EverCrypt_Error_Success;
+  }
+  else
   #endif
-  return EverCrypt_Error_UnsupportedAlgorithm;
+  { /* no accelerated implementation was selected above: use the Hacl_AES_128_GCM_M32 code */
+    uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t));
+    uint64_t *aes_gcm_ctx = (uint64_t *)ek;
+    Hacl_AES_128_GCM_M32_aes128_gcm_init(aes_gcm_ctx, k);
+    EverCrypt_AEAD_state_s
+    *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s));
+    p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_M32_AES128, .ek = ek });
+    *dst = p;
+    return EverCrypt_Error_Success;
+  }
 }

 static EverCrypt_Error_error_code
@@ -306,6 +338,107 @@ encrypt_aes128_gcm(
   #endif
 }

+/*
+ * AES128-GCM encryption backed by Hacl_AES_128_GCM_NI (AES-NI + PCLMUL).
+ * The HACL routine emits ciphertext and tag contiguously, so they are produced in
+ * a scratch buffer of plain_len + 16 bytes and then split into `cipher` and `tag`.
+ * Returns InvalidKey for a NULL state, InvalidIVLength for an empty IV and
+ * MaximumLengthExceeded when the scratch buffer cannot be sized or allocated.
+ */
+static EverCrypt_Error_error_code
+encrypt_aes128_gcm_aesni_pclmul(
+  EverCrypt_AEAD_state_s *s,
+  uint8_t *iv,
+  uint32_t iv_len,
+  uint8_t *ad,
+  uint32_t ad_len,
+  uint8_t *plain,
+  uint32_t plain_len,
+  uint8_t *cipher,
+  uint8_t *tag
+)
+{
+  #if HACL_CAN_COMPILE_AESNI_PCLMUL
+  if (s == NULL)
+  {
+    return EverCrypt_Error_InvalidKey;
+  }
+  if (iv_len == (uint32_t)0U)
+  {
+    return EverCrypt_Error_InvalidIVLength;
+  }
+  /* plain_len + 16 is computed in 32 bits: refuse lengths for which it would wrap. */
+  if (plain_len > (uint32_t)0xffffffffU - (uint32_t)16U)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  EverCrypt_AEAD_state_s scrut = *s;
+  uint8_t *ek = scrut.ek;
+  Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek;
+  uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + (uint32_t)16U);
+  if (out == NULL)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv);
+  memcpy(cipher, out, plain_len);
+  memcpy(tag, out + plain_len, 16);
+  KRML_HOST_FREE(out);
+  return EverCrypt_Error_Success;
+  #else
+  KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n",
+    __FILE__,
+    __LINE__,
+    "statically unreachable");
+  KRML_HOST_EXIT(255U);
+  #endif
+}
+
+/*
+ * AES128-GCM encryption backed by Hacl_AES_128_GCM_M32, the fallback chosen by
+ * create_in_aes128_gcm. Same contract as encrypt_aes128_gcm_aesni_pclmul.
+ */
+static EverCrypt_Error_error_code
+encrypt_aes128_gcm_m32(
+  EverCrypt_AEAD_state_s *s,
+  uint8_t *iv,
+  uint32_t iv_len,
+  uint8_t *ad,
+  uint32_t ad_len,
+  uint8_t *plain,
+  uint32_t plain_len,
+  uint8_t *cipher,
+  uint8_t *tag
+)
+{
+  if (s == NULL)
+  {
+    return EverCrypt_Error_InvalidKey;
+  }
+  if (iv_len == (uint32_t)0U)
+  {
+    return EverCrypt_Error_InvalidIVLength;
+  }
+  /* plain_len + 16 is computed in 32 bits: refuse lengths for which it would wrap. */
+  if (plain_len > (uint32_t)0xffffffffU - (uint32_t)16U)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  EverCrypt_AEAD_state_s scrut = *s;
+  uint8_t *ek = scrut.ek;
+  uint64_t *aes_gcm_ctx = (uint64_t *)ek;
+  uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + (uint32_t)16U);
+  if (out == NULL)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv);
+  memcpy(cipher, out, plain_len);
+  memcpy(tag, out + plain_len, 16);
+  KRML_HOST_FREE(out);
+  return EverCrypt_Error_Success;
+}
+
 static EverCrypt_Error_error_code
 encrypt_aes256_gcm(
   EverCrypt_AEAD_state_s *s,
@@ -488,6 +621,14 @@ EverCrypt_AEAD_encrypt(
         EverCrypt_Chacha20Poly1305_aead_encrypt(ek, iv, ad_len, ad, plain_len, plain, cipher, tag);
         return EverCrypt_Error_Success;
       }
+    case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128:
+      {
+        return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag);
+      }
+    case Spec_Cipher_Expansion_M32_AES128:
+      {
+        return encrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag);
+      }
     default:
       {
         KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__);
@@ -1282,6 +1423,115 @@ decrypt_aes128_gcm(
   #endif
 }

+/*
+ * AES128-GCM decryption backed by Hacl_AES_128_GCM_NI (AES-NI + PCLMUL).
+ * The HACL routine takes ciphertext and tag contiguously, so both are copied
+ * into a scratch buffer of cipher_len + 16 bytes first.
+ * Returns InvalidKey, InvalidIVLength or MaximumLengthExceeded as the encryption
+ * counterpart does, and AuthenticationFailure when the HACL routine returns false.
+ */
+static EverCrypt_Error_error_code
+decrypt_aes128_gcm_aesni_pclmul(
+  EverCrypt_AEAD_state_s *s,
+  uint8_t *iv,
+  uint32_t iv_len,
+  uint8_t *ad,
+  uint32_t ad_len,
+  uint8_t *cipher,
+  uint32_t cipher_len,
+  uint8_t *tag,
+  uint8_t *dst
+)
+{
+  #if HACL_CAN_COMPILE_AESNI_PCLMUL
+  if (s == NULL)
+  {
+    return EverCrypt_Error_InvalidKey;
+  }
+  if (iv_len == (uint32_t)0U)
+  {
+    return EverCrypt_Error_InvalidIVLength;
+  }
+  /* cipher_len + 16 is computed in 32 bits: refuse lengths for which it would wrap. */
+  if (cipher_len > (uint32_t)0xffffffffU - (uint32_t)16U)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  EverCrypt_AEAD_state_s scrut = *s;
+  uint8_t *ek = scrut.ek;
+  Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek;
+  uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + (uint32_t)16U);
+  if (in == NULL)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  memcpy(in, cipher, cipher_len);
+  memcpy(in + cipher_len, tag, 16);
+  bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv);
+  KRML_HOST_FREE(in);
+  if (r)
+  {
+    return EverCrypt_Error_Success;
+  }
+  return EverCrypt_Error_AuthenticationFailure;
+  #else
+  KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n",
+    __FILE__,
+    __LINE__,
+    "statically unreachable");
+  KRML_HOST_EXIT(255U);
+  #endif
+}
+
+/*
+ * AES128-GCM decryption backed by Hacl_AES_128_GCM_M32, the fallback chosen by
+ * create_in_aes128_gcm. Same contract as decrypt_aes128_gcm_aesni_pclmul.
+ */
+static EverCrypt_Error_error_code
+decrypt_aes128_gcm_m32(
+  EverCrypt_AEAD_state_s *s,
+  uint8_t *iv,
+  uint32_t iv_len,
+  uint8_t *ad,
+  uint32_t ad_len,
+  uint8_t *cipher,
+  uint32_t cipher_len,
+  uint8_t *tag,
+  uint8_t *dst
+)
+{
+  if (s == NULL)
+  {
+    return EverCrypt_Error_InvalidKey;
+  }
+  if (iv_len == (uint32_t)0U)
+  {
+    return EverCrypt_Error_InvalidIVLength;
+  }
+  /* cipher_len + 16 is computed in 32 bits: refuse lengths for which it would wrap. */
+  if (cipher_len > (uint32_t)0xffffffffU - (uint32_t)16U)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  EverCrypt_AEAD_state_s scrut = *s;
+  uint8_t *ek = scrut.ek;
+  uint64_t *aes_gcm_ctx = (uint64_t *)ek;
+  uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + (uint32_t)16U);
+  if (in == NULL)
+  {
+    return EverCrypt_Error_MaximumLengthExceeded;
+  }
+  memcpy(in, cipher, cipher_len);
+  memcpy(in + cipher_len, tag, 16);
+  bool r = Hacl_AES_128_GCM_M32_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv);
+  KRML_HOST_FREE(in);
+  if (r)
+  {
+    return EverCrypt_Error_Success;
+  }
+  return EverCrypt_Error_AuthenticationFailure;
+}
+
 static EverCrypt_Error_error_code
 decrypt_aes256_gcm(
   EverCrypt_AEAD_state_s *s,
@@ -1512,6 +1762,14 @@ EverCrypt_AEAD_decrypt(
       {
         return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst);
       }
+    case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128:
+      {
+        return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst);
+      }
+    case Spec_Cipher_Expansion_M32_AES128:
+      {
+        return decrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst);
+      }
     default:
       {
         KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__);
diff --git a/src/Hacl_AES_128_BitSlice.c b/src/Hacl_AES_128_BitSlice.c
new file mode 100644
index 00000000..a0d2938f
--- /dev/null
+++ b/src/Hacl_AES_128_BitSlice.c
@@ -0,0 +1,1105 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation
+ * Copyright (c) 2022-2023 HACL* Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"),
to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "internal/Hacl_AES_128_BitSlice.h" + +#include "internal/Hacl_Lib.h" + +typedef struct __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_s +{ + uint64_t fst; + uint64_t snd; + uint64_t thd; + uint64_t f3; + uint64_t f4; + uint64_t f5; + uint64_t f6; + uint64_t f7; +} +__uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t; + +static __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t +sub_bytes64x8( + uint64_t st0, + uint64_t st1, + uint64_t st2, + uint64_t st3, + uint64_t st4, + uint64_t st5, + uint64_t st6, + uint64_t st7 +) +{ + uint64_t input[8U] = { 0U }; + input[0U] = st0; + input[1U] = st1; + input[2U] = st2; + input[3U] = st3; + input[4U] = st4; + input[5U] = st5; + input[6U] = st6; + input[7U] = st7; + uint64_t output[8U] = { 0U }; + uint64_t tmp[121U] = { 0U }; + tmp[0U] = input[7U]; + tmp[1U] = input[6U]; + tmp[2U] = input[5U]; + tmp[3U] = input[4U]; + tmp[4U] = input[3U]; + tmp[5U] = input[2U]; + tmp[6U] = input[1U]; + tmp[7U] = input[0U]; + tmp[8U] = 
tmp[3U] ^ tmp[5U]; + tmp[9U] = tmp[0U] ^ tmp[6U]; + tmp[10U] = tmp[0U] ^ tmp[3U]; + tmp[11U] = tmp[0U] ^ tmp[5U]; + tmp[12U] = tmp[1U] ^ tmp[2U]; + tmp[13U] = tmp[12U] ^ tmp[7U]; + tmp[14U] = tmp[13U] ^ tmp[3U]; + tmp[15U] = tmp[9U] ^ tmp[8U]; + tmp[16U] = tmp[13U] ^ tmp[0U]; + tmp[17U] = tmp[13U] ^ tmp[6U]; + tmp[18U] = tmp[17U] ^ tmp[11U]; + tmp[19U] = tmp[4U] ^ tmp[15U]; + tmp[20U] = tmp[19U] ^ tmp[5U]; + tmp[21U] = tmp[19U] ^ tmp[1U]; + tmp[22U] = tmp[20U] ^ tmp[7U]; + tmp[23U] = tmp[20U] ^ tmp[12U]; + tmp[24U] = tmp[21U] ^ tmp[10U]; + tmp[25U] = tmp[7U] ^ tmp[24U]; + tmp[26U] = tmp[23U] ^ tmp[24U]; + tmp[27U] = tmp[23U] ^ tmp[11U]; + tmp[28U] = tmp[12U] ^ tmp[24U]; + tmp[29U] = tmp[9U] ^ tmp[28U]; + tmp[30U] = tmp[0U] ^ tmp[28U]; + tmp[31U] = tmp[15U] & tmp[20U]; + tmp[32U] = tmp[18U] & tmp[22U]; + tmp[33U] = tmp[32U] ^ tmp[31U]; + tmp[34U] = tmp[14U] & tmp[7U]; + tmp[35U] = tmp[34U] ^ tmp[31U]; + tmp[36U] = tmp[9U] & tmp[28U]; + tmp[37U] = tmp[17U] & tmp[13U]; + tmp[38U] = tmp[37U] ^ tmp[36U]; + tmp[39U] = tmp[16U] & tmp[25U]; + tmp[40U] = tmp[39U] ^ tmp[36U]; + tmp[41U] = tmp[10U] & tmp[24U]; + tmp[42U] = tmp[8U] & tmp[26U]; + tmp[43U] = tmp[42U] ^ tmp[41U]; + tmp[44U] = tmp[11U] & tmp[23U]; + tmp[45U] = tmp[44U] ^ tmp[41U]; + tmp[46U] = tmp[33U] ^ tmp[21U]; + tmp[47U] = tmp[35U] ^ tmp[45U]; + tmp[48U] = tmp[38U] ^ tmp[43U]; + tmp[49U] = tmp[40U] ^ tmp[45U]; + tmp[50U] = tmp[46U] ^ tmp[43U]; + tmp[51U] = tmp[47U] ^ tmp[27U]; + tmp[52U] = tmp[48U] ^ tmp[29U]; + tmp[53U] = tmp[49U] ^ tmp[30U]; + tmp[54U] = tmp[50U] ^ tmp[51U]; + tmp[55U] = tmp[50U] & tmp[52U]; + tmp[56U] = tmp[53U] ^ tmp[55U]; + tmp[57U] = tmp[54U] & tmp[56U]; + tmp[58U] = tmp[57U] ^ tmp[51U]; + tmp[59U] = tmp[52U] ^ tmp[53U]; + tmp[60U] = tmp[51U] ^ tmp[55U]; + tmp[61U] = tmp[60U] & tmp[59U]; + tmp[62U] = tmp[61U] ^ tmp[53U]; + tmp[63U] = tmp[52U] ^ tmp[62U]; + tmp[64U] = tmp[56U] ^ tmp[62U]; + tmp[65U] = tmp[53U] & tmp[64U]; + tmp[66U] = tmp[65U] ^ tmp[63U]; + tmp[67U] = tmp[56U] ^ tmp[65U]; 
+ tmp[68U] = tmp[58U] & tmp[67U]; + tmp[69U] = tmp[54U] ^ tmp[68U]; + tmp[70U] = tmp[69U] ^ tmp[66U]; + tmp[71U] = tmp[58U] ^ tmp[62U]; + tmp[72U] = tmp[58U] ^ tmp[69U]; + tmp[73U] = tmp[62U] ^ tmp[66U]; + tmp[74U] = tmp[71U] ^ tmp[70U]; + tmp[75U] = tmp[73U] & tmp[20U]; + tmp[76U] = tmp[66U] & tmp[22U]; + tmp[77U] = tmp[62U] & tmp[7U]; + tmp[78U] = tmp[72U] & tmp[28U]; + tmp[79U] = tmp[69U] & tmp[13U]; + tmp[80U] = tmp[58U] & tmp[25U]; + tmp[81U] = tmp[71U] & tmp[24U]; + tmp[82U] = tmp[74U] & tmp[26U]; + tmp[83U] = tmp[70U] & tmp[23U]; + tmp[84U] = tmp[73U] & tmp[15U]; + tmp[85U] = tmp[66U] & tmp[18U]; + tmp[86U] = tmp[62U] & tmp[14U]; + tmp[87U] = tmp[72U] & tmp[9U]; + tmp[88U] = tmp[69U] & tmp[17U]; + tmp[89U] = tmp[58U] & tmp[16U]; + tmp[90U] = tmp[71U] & tmp[10U]; + tmp[91U] = tmp[74U] & tmp[8U]; + tmp[92U] = tmp[70U] & tmp[11U]; + tmp[93U] = tmp[90U] ^ tmp[91U]; + tmp[94U] = tmp[85U] ^ tmp[93U]; + tmp[95U] = tmp[84U] ^ tmp[94U]; + tmp[96U] = tmp[75U] ^ tmp[77U]; + tmp[97U] = tmp[76U] ^ tmp[75U]; + tmp[98U] = tmp[78U] ^ tmp[79U]; + tmp[99U] = tmp[87U] ^ tmp[96U]; + tmp[100U] = tmp[82U] ^ tmp[98U]; + tmp[101U] = tmp[83U] ^ tmp[99U]; + tmp[102U] = tmp[100U] ^ tmp[101U]; + tmp[103U] = tmp[98U] ^ tmp[97U]; + tmp[104U] = tmp[78U] ^ tmp[80U]; + tmp[105U] = tmp[88U] ^ tmp[93U]; + tmp[106U] = tmp[96U] ^ tmp[104U]; + tmp[107U] = tmp[95U] ^ tmp[103U]; + tmp[108U] = tmp[81U] ^ tmp[100U]; + tmp[109U] = tmp[89U] ^ tmp[102U]; + tmp[110U] = tmp[105U] ^ tmp[106U]; + uint64_t uu____0 = tmp[87U]; + uint64_t uu____1 = tmp[110U]; + tmp[111U] = (~uu____0 & ~uu____1) | (uu____0 & uu____1); + tmp[112U] = tmp[90U] ^ tmp[108U]; + tmp[113U] = tmp[94U] ^ tmp[86U]; + tmp[114U] = tmp[95U] ^ tmp[108U]; + uint64_t uu____2 = tmp[102U]; + uint64_t uu____3 = tmp[110U]; + tmp[115U] = (~uu____2 & ~uu____3) | (uu____2 & uu____3); + tmp[116U] = tmp[106U] ^ tmp[107U]; + uint64_t uu____4 = tmp[107U]; + uint64_t uu____5 = tmp[108U]; + tmp[117U] = (~uu____4 & ~uu____5) | (uu____4 & uu____5); + 
tmp[118U] = tmp[109U] ^ tmp[112U]; + uint64_t uu____6 = tmp[118U]; + uint64_t uu____7 = tmp[92U]; + tmp[119U] = (~uu____6 & ~uu____7) | (uu____6 & uu____7); + tmp[120U] = tmp[113U] ^ tmp[109U]; + uint64_t o = tmp[111U]; + output[0U] = o; + uint64_t o0 = tmp[115U]; + output[1U] = o0; + uint64_t o8 = tmp[120U]; + output[2U] = o8; + uint64_t o9 = tmp[116U]; + output[3U] = o9; + uint64_t o10 = tmp[107U]; + output[4U] = o10; + uint64_t o11 = tmp[119U]; + output[5U] = o11; + uint64_t o12 = tmp[117U]; + output[6U] = o12; + uint64_t o13 = tmp[114U]; + output[7U] = o13; + uint64_t o00 = output[0U]; + uint64_t o1 = output[1U]; + uint64_t o2 = output[2U]; + uint64_t o3 = output[3U]; + uint64_t o4 = output[4U]; + uint64_t o5 = output[5U]; + uint64_t o6 = output[6U]; + uint64_t o7 = output[7U]; + return + ( + (__uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t){ + .fst = o00, + .snd = o1, + .thd = o2, + .f3 = o3, + .f4 = o4, + .f5 = o5, + .f6 = o6, + .f7 = o7 + } + ); +} + +static void load_block0(uint64_t *out, uint8_t *inp) +{ + uint8_t *b1 = inp; + uint8_t *b2 = inp + (uint32_t)8U; + uint64_t u0 = load64_le(b1); + uint64_t fst = u0; + uint64_t u1 = load64_le(b2); + uint64_t snd = u1; + uint64_t fst1 = Lib_Transposition64x8_transpose_bits64(fst); + uint64_t snd1 = Lib_Transposition64x8_transpose_bits64(snd); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint32_t sh = i * (uint32_t)8U; + uint64_t u = fst1 >> sh & (uint64_t)0xffU; + uint64_t u10 = u ^ (snd1 >> sh & (uint64_t)0xffU) << (uint32_t)8U; + out[i] = u10;); +} + +static void transpose_state(uint64_t *st) +{ + uint64_t i0 = st[0U]; + uint64_t i1 = st[1U]; + uint64_t i2 = st[2U]; + uint64_t i3 = st[3U]; + uint64_t i4 = st[4U]; + uint64_t i5 = st[5U]; + uint64_t i6 = st[6U]; + uint64_t i7 = st[7U]; + Lib_Transposition64x8_uint64x8 + scrut = + Lib_Transposition64x8_transpose_bits64x8(( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = i0, .snd = i1 }, .snd = { 
.fst = i2, .snd = i3 } }, + .snd = { .fst = { .fst = i4, .snd = i5 }, .snd = { .fst = i6, .snd = i7 } } + } + )); + uint64_t t7 = scrut.snd.snd.snd; + uint64_t t6 = scrut.snd.snd.fst; + uint64_t t5 = scrut.snd.fst.snd; + uint64_t t4 = scrut.snd.fst.fst; + uint64_t t3 = scrut.fst.snd.snd; + uint64_t t2 = scrut.fst.snd.fst; + uint64_t t1 = scrut.fst.fst.snd; + uint64_t t0 = scrut.fst.fst.fst; + st[0U] = t0; + st[1U] = t1; + st[2U] = t2; + st[3U] = t3; + st[4U] = t4; + st[5U] = t5; + st[6U] = t6; + st[7U] = t7; +} + +void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp) +{ + uint64_t i0 = inp[0U]; + uint64_t i1 = inp[1U]; + uint64_t i2 = inp[2U]; + uint64_t i3 = inp[3U]; + uint64_t i4 = inp[4U]; + uint64_t i5 = inp[5U]; + uint64_t i6 = inp[6U]; + uint64_t i7 = inp[7U]; + Lib_Transposition64x8_uint64x8 + scrut = + Lib_Transposition64x8_transpose_bits64x8(( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = i0, .snd = i1 }, .snd = { .fst = i2, .snd = i3 } }, + .snd = { .fst = { .fst = i4, .snd = i5 }, .snd = { .fst = i6, .snd = i7 } } + } + )); + uint64_t t1 = scrut.fst.fst.snd; + uint64_t t0 = scrut.fst.fst.fst; + store64_le(out, t0); + store64_le(out + (uint32_t)8U, t1); +} + +void Hacl_Impl_AES_CoreBitSlice_load_key1(uint64_t *out, uint8_t *k) +{ + load_block0(out, k); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u = out[i]; + uint64_t u1 = u ^ u << (uint32_t)16U; + uint64_t u2 = u1 ^ u1 << (uint32_t)32U; + out[i] = u2;); +} + +void Hacl_Impl_AES_CoreBitSlice_load_nonce(uint64_t *out, uint8_t *nonce1) +{ + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce1, (uint32_t)12U * sizeof (uint8_t)); + Hacl_Impl_AES_CoreBitSlice_load_key1(out, nb); +} + +void Hacl_Impl_AES_CoreBitSlice_load_state(uint64_t *out, uint64_t *nonce1, uint32_t counter) +{ + uint8_t ctr[16U] = { 0U }; + store32_be(ctr, counter); + store32_be(ctr + (uint32_t)4U, counter + (uint32_t)1U); + store32_be(ctr + (uint32_t)8U, counter + 
(uint32_t)2U); + store32_be(ctr + (uint32_t)12U, counter + (uint32_t)3U); + load_block0(out, ctr); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u = out[i]; + uint64_t + u1 = ((u << (uint32_t)12U | u << (uint32_t)24U) | u << (uint32_t)36U) | u << (uint32_t)48U; + uint64_t u2 = u1 & (uint64_t)0xf000f000f000f000U; + out[i] = u2 ^ nonce1[i];); +} + +void Hacl_Impl_AES_CoreBitSlice_xor_state_key1(uint64_t *st, uint64_t *ost) +{ + KRML_MAYBE_FOR8(i, (uint32_t)0U, (uint32_t)8U, (uint32_t)1U, st[i] = st[i] ^ ost[i];); +} + +static void xor_block(uint8_t *out, uint64_t *st, uint8_t *inp) +{ + transpose_state(st); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint8_t *ob = out + i * (uint32_t)8U; + uint8_t *ib = inp + i * (uint32_t)8U; + uint64_t u = load64_le(ib); + uint64_t u0 = u; + store64_le(ob, u0 ^ st[i]);); +} + +static void sub_bytes_state(uint64_t *st) +{ + uint64_t st0 = st[0U]; + uint64_t st1 = st[1U]; + uint64_t st2 = st[2U]; + uint64_t st3 = st[3U]; + uint64_t st4 = st[4U]; + uint64_t st5 = st[5U]; + uint64_t st6 = st[6U]; + uint64_t st7 = st[7U]; + __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t + scrut = sub_bytes64x8(st0, st1, st2, st3, st4, st5, st6, st7); + uint64_t st01 = scrut.fst; + uint64_t st11 = scrut.snd; + uint64_t st21 = scrut.thd; + uint64_t st31 = scrut.f3; + uint64_t st41 = scrut.f4; + uint64_t st51 = scrut.f5; + uint64_t st61 = scrut.f6; + uint64_t st71 = scrut.f7; + st[0U] = st01; + st[1U] = st11; + st[2U] = st21; + st[3U] = st31; + st[4U] = st41; + st[5U] = st51; + st[6U] = st61; + st[7U] = st71; +} + +static void shift_rows_state(uint64_t *st) +{ + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t rowi = st[i]; + st[i] = + ((((((rowi & (uint64_t)0x1111111111111111U) + | (rowi & (uint64_t)0x2220222022202220U) >> (uint32_t)4U) + | (rowi & (uint64_t)0x0002000200020002U) << (uint32_t)12U) + | (rowi & (uint64_t)0x4400440044004400U) 
>> (uint32_t)8U) + | (rowi & (uint64_t)0x0044004400440044U) << (uint32_t)8U) + | (rowi & (uint64_t)0x8000800080008000U) >> (uint32_t)12U) + | (rowi & (uint64_t)0x0888088808880888U) << (uint32_t)4U;); +} + +static void mix_columns_state(uint64_t *st) +{ + uint64_t col[8U] = { 0U }; + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t coli = st[i]; + col[i] = + coli + ^ + ((coli & (uint64_t)0xeeeeeeeeeeeeeeeeU) + >> (uint32_t)1U + | (coli & (uint64_t)0x1111111111111111U) << (uint32_t)3U);); + uint64_t col0 = col[0U]; + uint64_t + ncol0 = + col0 + ^ + ((col0 & (uint64_t)0xccccccccccccccccU) + >> (uint32_t)2U + | (col0 & (uint64_t)0x3333333333333333U) << (uint32_t)2U); + st[0U] = st[0U] ^ ncol0; + KRML_MAYBE_FOR7(i, + (uint32_t)0U, + (uint32_t)7U, + (uint32_t)1U, + uint64_t prev = col[i]; + uint64_t next = col[i + (uint32_t)1U]; + uint64_t + ncoli = + next + ^ + ((next & (uint64_t)0xccccccccccccccccU) + >> (uint32_t)2U + | (next & (uint64_t)0x3333333333333333U) << (uint32_t)2U); + st[i + (uint32_t)1U] = st[i + (uint32_t)1U] ^ (ncoli ^ prev);); + st[0U] = st[0U] ^ col[7U]; + st[1U] = st[1U] ^ col[7U]; + st[3U] = st[3U] ^ col[7U]; + st[4U] = st[4U] ^ col[7U]; +} + +void Hacl_Impl_AES_CoreBitSlice_aes_enc(uint64_t *st, uint64_t *key) +{ + sub_bytes_state(st); + shift_rows_state(st); + mix_columns_state(st); + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, key); +} + +void Hacl_Impl_AES_CoreBitSlice_aes_enc_last(uint64_t *st, uint64_t *key) +{ + sub_bytes_state(st); + shift_rows_state(st); + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, key); +} + +void +Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(uint64_t *next, uint64_t *prev, uint8_t rcon1) +{ + memcpy(next, prev, (uint32_t)8U * sizeof (uint64_t)); + sub_bytes_state(next); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u3 = next[i] & (uint64_t)0xf000f000f000f000U; + uint64_t n = u3 >> (uint32_t)12U; + uint64_t n1 = (n >> (uint32_t)1U | n << (uint32_t)3U) & 
(uint64_t)0x000f000f000f000fU; + uint64_t ri = (uint64_t)(rcon1 >> i & (uint8_t)1U); + uint64_t ri1 = ri ^ ri << (uint32_t)16U; + uint64_t ri2 = ri1 ^ ri1 << (uint32_t)32U; + uint64_t n2 = n1 ^ ri2; + uint64_t n3 = n2 << (uint32_t)12U; + next[i] = n3 ^ u3 >> (uint32_t)4U;); +} + +void Hacl_Impl_AES_CoreBitSlice_key_expansion_step(uint64_t *next, uint64_t *prev) +{ + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t p = prev[i]; + uint64_t n = next[i]; + uint64_t + p1 = + p + ^ + ((p & (uint64_t)0x0fff0fff0fff0fffU) + << (uint32_t)4U + ^ + ((p & (uint64_t)0x00ff00ff00ff00ffU) + << (uint32_t)8U + ^ (p & (uint64_t)0x000f000f000f000fU) << (uint32_t)12U)); + next[i] = n ^ p1;); +} + +void +Hacl_Impl_AES_Generic_aes128_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = counter + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i0 * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(ob, st, ib); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = counter + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + 
(uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(last, st, last); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +void +Hacl_Impl_AES_Generic_aes256_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = counter + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i0 * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(ob, st, ib); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = counter + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + 
uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(last, st, last); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, key); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next1[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next1[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next2[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next2[i] = n4;); + 
Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next3[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next3[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next4[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next4[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next5[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next5[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next6[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next6[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, prev6); + 
uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next7[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next7[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next8[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next8[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, prev8); + uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next9[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next9[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) +{ + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint64_t st[8U] = { 0U }; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, counter); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = 
kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + Hacl_Impl_AES_CoreBitSlice_store_block0(kb, st); +} + +inline void +Hacl_AES_128_BitSlice_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[96U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, k); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next1[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next1[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next2[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next2[i] = 
n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next3[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next3[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next4[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next4[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next5[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next5[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next6[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next6[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, 
prev6); + uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next7[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next7[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next8[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next8[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, prev8); + uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next9[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next9[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_128_BitSlice_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[96U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, k); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + 
Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next1[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next1[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next2[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next2[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next3[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next3[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + 
KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next4[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next4[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next5[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next5[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next6[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next6[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, prev6); + uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next7[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next7[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, 
+ uint64_t n2 = next8[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next8[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, prev8); + uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next9[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next9[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, inp, ctx, c); +} + diff --git a/src/Hacl_AES_128_GCM_M32.c b/src/Hacl_AES_128_GCM_M32.c new file mode 100644 index 00000000..bd172a0e --- /dev/null +++ b/src/Hacl_AES_128_GCM_M32.c @@ -0,0 +1,208 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "Hacl_AES_128_GCM_M32.h"
+
+#include "internal/Hacl_AES_128_BitSlice.h"
+
+uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len = (uint32_t)396U; /* context size in uint64_t words; the code below uses words [0,128) for AES, [128,394) for GHASH and [394,396) for the tag mask */
+
+void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key) /* Expand `key` into ctx and initialise the GHASH state at ctx + 128. */
+{
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t nonce0[12U] = { 0U };
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  Hacl_AES_128_BitSlice_aes128_init(aes_ctx, key, nonce0);
+  Hacl_AES_128_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); /* GHASH key = key-stream block for the all-zero nonce and counter 0 */
+  Hacl_Gf128_PreComp_gcm_init(gcm_ctx, gcm_key);
+}
+
+void
+Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( /* Encrypt `text` and append a 16-byte tag: writes len + 16 bytes to `out`. */
+  uint64_t *ctx,    /* context prepared by Hacl_AES_128_GCM_M32_aes128_gcm_init; nonce and tag-mask words are overwritten */
+  uint32_t len,     /* plaintext length in bytes */
+  uint8_t *out,     /* receives len ciphertext bytes, then the tag at out + len */
+  uint8_t *text,    /* plaintext, len bytes */
+  uint32_t aad_len, /* additional-data length in bytes */
+  uint8_t *aad,     /* additional authenticated data */
+  uint32_t iv_len,  /* IV length in bytes; exactly 12 takes the direct-nonce path */
+  uint8_t *iv       /* IV, iv_len bytes */
+)
+{
+  uint8_t tmp[16U] = { 0U };
+  uint8_t *cip = out;
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  uint64_t *tag_mix = ctx + (uint32_t)394U;
+  uint32_t ctr;
+  uint8_t tag_mix10[16U] = { 0U };
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t tag_iv[16U] = { 0U };
+  uint8_t size_iv[16U] = { 0U };
+  uint8_t tag_mix1[16U] = { 0U };
+  if (iv_len == (uint32_t)12U)
+  {
+    uint64_t *aes_ctx1 = ctx;
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv);
+    Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); /* counter 1 yields the tag mask */
+    uint64_t u = load64_le(tag_mix10);
+    ctx[394U] = u;
+    uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U);
+    ctx[395U] = u0;
+    ctr = (uint32_t)2U; /* payload key stream starts at counter 2 */
+  }
+  else
+  {
+    uint64_t *aes_ctx1 = ctx;
+    uint64_t *gcm_ctx1 = ctx + (uint32_t)128U;
+    store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); /* rebuilds a 16-byte key from GHASH context words 8 and 9 -- presumably the hash key stored by gcm_init; confirm */
+    store64_be(gcm_key, gcm_ctx1[9U]);
+    Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key);
+    store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U); /* widen before multiplying: the 32-bit product wrapped for iv_len >= 2^29 */
+    KRML_MAYBE_FOR16(i,
+      (uint32_t)0U,
+      (uint32_t)16U,
+      (uint32_t)1U,
+      size_iv[i] = tag_iv[i] ^ size_iv[i];);
+    Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); /* first 12 bytes of the hashed IV become the nonce */
+    uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); /* last 4 bytes become the initial counter */
+    uint32_t ctr0 = u0;
+    Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0);
+    uint64_t u = load64_le(tag_mix1);
+    ctx[394U] = u;
+    uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U);
+    ctx[395U] = u1;
+    ctr = ctr0 + (uint32_t)1U;
+  }
+  Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, cip);
+  store64_be(tmp, (uint64_t)aad_len * (uint64_t)8U); /* bit lengths are computed in 64 bits so inputs >= 2^29 bytes do not wrap */
+  store64_be(tmp + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp);
+  Hacl_Gf128_PreComp_gcm_emit(tmp, gcm_ctx);
+  uint64_t u0 = load64_le(tmp);
+  uint64_t tmp0 = u0;
+  uint64_t u = load64_le(tmp + (uint32_t)8U);
+  uint64_t tmp1 = u;
+  uint64_t tmp01 = tmp0 ^ tag_mix[0U]; /* tag = GHASH output XOR tag mask */
+  uint64_t tmp11 = tmp1 ^ tag_mix[1U];
+  store64_le(out + len, tmp01);
+  store64_le(out + len + (uint32_t)8U, tmp11);
+  gcm_ctx[0U] = (uint64_t)0U; /* clear the first two GHASH context words so the context can be reused */
+  gcm_ctx[1U] = (uint64_t)0U;
+}
+
+bool
+Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( /* Verify the tag at cipher + len; returns true and writes the plaintext only when it matches. */
+  uint64_t *ctx,    /* context prepared by Hacl_AES_128_GCM_M32_aes128_gcm_init; nonce and tag-mask words are overwritten */
+  uint32_t len,     /* ciphertext length in bytes, excluding the 16-byte tag */
+  uint8_t *out,     /* receives len plaintext bytes; left untouched when false is returned */
+  uint8_t *cipher,  /* len ciphertext bytes followed by the 16-byte tag */
+  uint32_t aad_len, /* additional-data length in bytes */
+  uint8_t *aad,     /* additional authenticated data */
+  uint32_t iv_len,  /* IV length in bytes; exactly 12 takes the direct-nonce path */
+  uint8_t *iv       /* IV, iv_len bytes */
+)
+{
+  uint8_t scratch[18U] = { 0U };
+  uint8_t *text = scratch; /* bytes 0..15: length block, then the recomputed tag */
+  uint8_t *result = scratch + (uint32_t)17U; /* byte 17: accumulated tag difference */
+  uint8_t *ciphertext = cipher;
+  uint8_t *tag = cipher + len;
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  uint64_t *tag_mix = ctx + (uint32_t)394U;
+  uint32_t ctr;
+  uint8_t tag_mix10[16U] = { 0U };
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t tag_iv[16U] = { 0U };
+  uint8_t size_iv[16U] = { 0U };
+  uint8_t tag_mix1[16U] = { 0U };
+  if (iv_len == (uint32_t)12U)
+  {
+    uint64_t *aes_ctx1 = ctx;
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv);
+    Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); /* counter 1 yields the tag mask */
+    uint64_t u = load64_le(tag_mix10);
+    ctx[394U] = u;
+    uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U);
+    ctx[395U] = u0;
+    ctr = (uint32_t)2U; /* payload key stream starts at counter 2 */
+  }
+  else
+  {
+    uint64_t *aes_ctx1 = ctx;
+    uint64_t *gcm_ctx1 = ctx + (uint32_t)128U;
+    store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); /* rebuilds a 16-byte key from GHASH context words 8 and 9 -- presumably the hash key stored by gcm_init; confirm */
+    store64_be(gcm_key, gcm_ctx1[9U]);
+    Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key);
+    store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U); /* widen before multiplying: the 32-bit product wrapped for iv_len >= 2^29 */
+    KRML_MAYBE_FOR16(i,
+      (uint32_t)0U,
+      (uint32_t)16U,
+      (uint32_t)1U,
+      size_iv[i] = tag_iv[i] ^ size_iv[i];);
+    Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); /* first 12 bytes of the hashed IV become the nonce */
+    uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); /* last 4 bytes become the initial counter */
+    uint32_t ctr0 = u0;
+    Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0);
+    uint64_t u = load64_le(tag_mix1);
+    ctx[394U] = u;
+    uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U);
+    ctx[395U] = u1;
+    ctr = ctr0 + (uint32_t)1U;
+  }
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, ciphertext);
+  store64_be(text, (uint64_t)aad_len * (uint64_t)8U); /* bit lengths are computed in 64 bits so inputs >= 2^29 bytes do not wrap */
+  store64_be(text + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text);
+  Hacl_Gf128_PreComp_gcm_emit(text, gcm_ctx); /* NOTE(review): encrypt clears gcm_ctx[0..1] after this step, decrypt does not -- confirm context reuse after decrypt is safe */
+  uint64_t u0 = load64_le(text);
+  uint64_t text0 = u0;
+  uint64_t u = load64_le(text + (uint32_t)8U);
+  uint64_t text1 = u;
+  uint64_t text01 = text0 ^ tag_mix[0U]; /* expected tag = GHASH output XOR tag mask */
+  uint64_t text11 = text1 ^ tag_mix[1U];
+  store64_le(text, text01);
+  store64_le(text + (uint32_t)8U, text11);
+  KRML_MAYBE_FOR16(i,
+    (uint32_t)0U,
+    (uint32_t)16U,
+    (uint32_t)1U,
+    result[0U] = result[0U] | (text[i] ^ tag[i]););
+  uint8_t res8 = result[0U]; /* zero only if all 16 tag bytes matched; every byte is always compared */
+  if (res8 == (uint8_t)0U)
+  {
+    Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr);
+    return true;
+  }
+  return false;
+}
+
diff --git a/src/Hacl_AES_128_GCM_NI.c b/src/Hacl_AES_128_GCM_NI.c
new file mode 100644
index 00000000..16e03251
--- /dev/null
+++ b/src/Hacl_AES_128_GCM_NI.c
@@ -0,0 +1,409 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation
+ * Copyright (c) 2022-2023 HACL* Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "Hacl_AES_128_GCM_NI.h"
+
+/*
+ * Layout of the combined context as it is used by the functions in this file
+ * (indices count vec128 slots; at least 22 slots are touched):
+ *   ctx[0]       nonce block (the counter is inserted into 32-bit lane 3)
+ *   ctx[1..11]   AES-128 round keys
+ *   ctx[16..]    GF(128) state handed to Hacl_Gf128_NI_*; slot ctx[16] is
+ *                cleared after every tag, slot ctx[20] is serialized as the
+ *                GHASH key when the IV is not 12 bytes long
+ *   ctx[21]      encrypted initial counter block, XORed into the GHASH output
+ *                to form the tag
+ */
+
+/*
+ * XOR the 64 bytes at `in` with the AES-128 CTR keystream blocks for the
+ * counters ctr, ctr + 1, ctr + 2 and ctr + 3 and store the result at `out`.
+ *
+ * All four input vectors are loaded before anything is stored, so `in` and
+ * `out` may be the same buffer.
+ */
+static inline void
+aes128_gcm_ni_ctr_xor64(
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx,
+  uint32_t ctr,
+  uint8_t *in,
+  uint8_t *out
+)
+{
+  KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U };
+  Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U;
+  Lib_IntVector_Intrinsics_vec128 *k0 = kex;
+  Lib_IntVector_Intrinsics_vec128 *kr = kex + (uint32_t)1U;
+  Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U;
+  Lib_IntVector_Intrinsics_vec128 nonce0 = aes_ctx[0U];
+  /* The block counter is stored big-endian in the last 32-bit lane. */
+  uint32_t counter0 = htobe32(ctr);
+  uint32_t counter1 = htobe32(ctr + (uint32_t)1U);
+  uint32_t counter2 = htobe32(ctr + (uint32_t)2U);
+  uint32_t counter3 = htobe32(ctr + (uint32_t)3U);
+  st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U);
+  st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U);
+  st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U);
+  st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U);
+  /* Initial AddRoundKey. */
+  st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]);
+  st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]);
+  st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]);
+  st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]);
+  /* Nine full rounds followed by the final round. */
+  KRML_MAYBE_FOR9(i,
+    (uint32_t)0U,
+    (uint32_t)9U,
+    (uint32_t)1U,
+    Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U;
+    st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]);
+    st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]);
+    st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]);
+    st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]););
+  st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]);
+  st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]);
+  st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]);
+  st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]);
+  Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(in);
+  Lib_IntVector_Intrinsics_vec128
+  v1 = Lib_IntVector_Intrinsics_vec128_load128_le(in + (uint32_t)16U);
+  Lib_IntVector_Intrinsics_vec128
+  v2 = Lib_IntVector_Intrinsics_vec128_load128_le(in + (uint32_t)32U);
+  Lib_IntVector_Intrinsics_vec128
+  v3 = Lib_IntVector_Intrinsics_vec128_load128_le(in + (uint32_t)48U);
+  Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]);
+  Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]);
+  Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]);
+  Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out, v01);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out + (uint32_t)16U, v11);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out + (uint32_t)32U, v21);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out + (uint32_t)48U, v31);
+}
+
+/*
+ * Apply the AES-128 CTR keystream starting at block counter `ctr` to `len`
+ * bytes of `in`, writing `len` bytes to `out`.
+ *
+ * Full 64-byte chunks are processed directly. A trailing partial chunk goes
+ * through a zero-padded stack buffer so that no byte beyond `len` is read
+ * from `in` or written to `out`.
+ */
+static inline void
+aes128_gcm_ni_ctr(
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx,
+  uint32_t ctr,
+  uint32_t len,
+  uint8_t *in,
+  uint8_t *out
+)
+{
+  uint32_t blocks64 = len / (uint32_t)64U;
+  for (uint32_t i = (uint32_t)0U; i < blocks64; i++)
+  {
+    /* Each 64-byte chunk consumes four consecutive counter values. */
+    aes128_gcm_ni_ctr_xor64(aes_ctx,
+      ctr + i * (uint32_t)4U,
+      in + i * (uint32_t)64U,
+      out + i * (uint32_t)64U);
+  }
+  uint32_t rem = len % (uint32_t)64U;
+  if (rem > (uint32_t)0U)
+  {
+    uint8_t last[64U] = { 0U };
+    memcpy(last, in + blocks64 * (uint32_t)64U, rem * sizeof (uint8_t));
+    aes128_gcm_ni_ctr_xor64(aes_ctx, ctr + blocks64 * (uint32_t)4U, last, last);
+    memcpy(out + blocks64 * (uint32_t)64U, last, rem * sizeof (uint8_t));
+  }
+}
+
+/*
+ * Install the IV for one message.
+ *
+ * Sets the nonce slot ctx[0], stores the encrypted initial counter block in
+ * ctx[21U] and returns the counter to use for the first data block.
+ *
+ *  - iv_len == 12: the IV is used as the nonce directly; the tag mask is the
+ *    key block for counter 1 and data starts at counter 2.
+ *  - otherwise: the IV is hashed with GHASH together with a block carrying
+ *    its bit length; the first 12 bytes of the digest become the nonce and
+ *    the last 4 bytes (big-endian) the initial counter ctr0. The tag mask is
+ *    the key block for ctr0 and data starts at ctr0 + 1.
+ */
+static inline uint32_t
+aes128_gcm_ni_compute_iv(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  uint8_t tag_mix[16U] = { 0U };
+  if (iv_len == (uint32_t)12U)
+  {
+    Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, iv);
+    Hacl_AES_128_NI_aes128_key_block(tag_mix, aes_ctx, (uint32_t)1U);
+    ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix);
+    return (uint32_t)2U;
+  }
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t tag_iv[16U] = { 0U };
+  uint8_t size_iv[16U] = { 0U };
+  Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]);
+  Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key);
+  /* Widen before multiplying: iv_len * 8 does not fit in 32 bits once
+     iv_len >= 2^29. */
+  store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U);
+  /* Chain the length block onto the running digest by hand. */
+  KRML_MAYBE_FOR16(i,
+    (uint32_t)0U,
+    (uint32_t)16U,
+    (uint32_t)1U,
+    size_iv[i] = tag_iv[i] ^ size_iv[i];);
+  Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+  Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, tag_iv);
+  uint32_t ctr0 = load32_be(tag_iv + (uint32_t)12U);
+  Hacl_AES_128_NI_aes128_key_block(tag_mix, aes_ctx, ctr0);
+  ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix);
+  return ctr0 + (uint32_t)1U;
+}
+
+/*
+ * Prepare `ctx` for AES-128-GCM under the 16-byte `key`.
+ *
+ * Expands the AES key with an all-zero nonce, takes the key block for counter
+ * 0 as the GHASH key and hands it to Hacl_Gf128_NI_gcm_init.
+ */
+void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key)
+{
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t nonce0[12U] = { 0U };
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  Hacl_AES_128_NI_aes128_init(aes_ctx, key, nonce0);
+  Hacl_AES_128_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U);
+  Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key);
+}
+
+/*
+ * Encrypt `len` bytes of `text` and authenticate them together with `aad`.
+ *
+ * ctx      context prepared by Hacl_AES_128_GCM_NI_aes128_gcm_init
+ * out      receives `len` ciphertext bytes followed by the 16-byte tag, so it
+ *          must have room for len + 16 bytes
+ * aad      `aad_len` bytes of additional authenticated data
+ * iv       `iv_len` bytes of IV
+ */
+void
+Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *text,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  uint32_t ctr = aes128_gcm_ni_compute_iv(ctx, iv_len, iv);
+  uint8_t *cip = out;
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  aes128_gcm_ni_ctr(aes_ctx, ctr, len, text, cip);
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U];
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip);
+  uint8_t tmp[16U] = { 0U };
+  /* Bit lengths are 64-bit fields; widen before multiplying so that inputs
+     of 2^29 bytes or more do not wrap. */
+  store64_be(tmp, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(tmp + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp);
+  Hacl_Gf128_NI_gcm_emit(tmp, gcm_ctx);
+  Lib_IntVector_Intrinsics_vec128 tmp_vec = Lib_IntVector_Intrinsics_vec128_load128_le(tmp);
+  Lib_IntVector_Intrinsics_vec128
+  tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1);
+  /* Leave the GF(128) state clean for the next message on this context. */
+  gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero;
+}
+
+/*
+ * Verify the tag of a message and, only if it matches, decrypt it.
+ *
+ * ctx      context prepared by Hacl_AES_128_GCM_NI_aes128_gcm_init
+ * cipher   `len` ciphertext bytes followed by the 16-byte tag
+ * out      receives `len` plaintext bytes; left untouched on failure
+ * aad      `aad_len` bytes of additional authenticated data
+ * iv       `iv_len` bytes of IV
+ *
+ * Returns true when the tag is valid, false otherwise.
+ */
+bool
+Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *cipher,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  uint8_t scratch[18U] = { 0U };
+  uint8_t *text = scratch;
+  uint8_t *result = scratch + (uint32_t)17U;
+  uint8_t *ciphertext = cipher;
+  uint8_t *tag = cipher + len;
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  uint32_t ctr = aes128_gcm_ni_compute_iv(ctx, iv_len, iv);
+  /* Read the tag mask only after the IV has been processed: ctx[21U] is
+     rewritten by the call above, and a copy taken earlier would still hold
+     the mask of the previous message (or an unset slot right after init). */
+  Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U];
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext);
+  /* Same 64-bit widening of the bit lengths as in encrypt. */
+  store64_be(text, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(text + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text);
+  Hacl_Gf128_NI_gcm_emit(text, gcm_ctx);
+  /* Mirror encrypt, which clears this slot after emitting the tag, so the
+     context can be reused after a decryption as well.
+     NOTE(review): presumably gcm_ctx[0U] is the GHASH accumulator - confirm
+     against Hacl_Gf128_NI. */
+  gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero;
+  Lib_IntVector_Intrinsics_vec128 text_vec = Lib_IntVector_Intrinsics_vec128_load128_le(text);
+  Lib_IntVector_Intrinsics_vec128
+  text_vec1 = Lib_IntVector_Intrinsics_vec128_xor(text_vec, tag_mix);
+  Lib_IntVector_Intrinsics_vec128_store128_le(text, text_vec1);
+  /* Accumulate the byte differences without branching on them. */
+  KRML_MAYBE_FOR16(i,
+    (uint32_t)0U,
+    (uint32_t)16U,
+    (uint32_t)1U,
+    result[0U] = result[0U] | (text[i] ^ tag[i]););
+  uint8_t res8 = result[0U];
+  if (res8 == (uint8_t)0U)
+  {
+    aes128_gcm_ni_ctr(aes_ctx, ctr, len, ciphertext, out);
+    return true;
+  }
+  return false;
+}
+
diff --git a/src/Hacl_AES_128_NI.c b/src/Hacl_AES_128_NI.c
new file mode 100644
index 00000000..4a9d9ca8
--- /dev/null
+++ b/src/Hacl_AES_128_NI.c
@@ -0,0 +1,1084 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation
+ * Copyright (c) 2022-2023 HACL* Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_128_NI.h" + +void +Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *uu____0 = kex; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key); + Lib_IntVector_Intrinsics_vec128 *prev = kex; + Lib_IntVector_Intrinsics_vec128 *next = kex + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key1 = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key4 = + Lib_IntVector_Intrinsics_vec128_xor(key3, + Lib_IntVector_Intrinsics_vec128_shift_left(key3, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key4); + Lib_IntVector_Intrinsics_vec128 *prev1 = kex + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v1, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 
key10 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key40 = + Lib_IntVector_Intrinsics_vec128_xor(key30, + Lib_IntVector_Intrinsics_vec128_shift_left(key30, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key40); + Lib_IntVector_Intrinsics_vec128 *prev2 = kex + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v2, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key11 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key41 = + Lib_IntVector_Intrinsics_vec128_xor(key31, + Lib_IntVector_Intrinsics_vec128_shift_left(key31, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key41); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v3, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key12 = prev3[0U]; + 
Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key42 = + Lib_IntVector_Intrinsics_vec128_xor(key32, + Lib_IntVector_Intrinsics_vec128_shift_left(key32, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key42); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key13 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key43 = + Lib_IntVector_Intrinsics_vec128_xor(key33, + Lib_IntVector_Intrinsics_vec128_shift_left(key33, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key43); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key14 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 + key24 = + 
Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key44 = + Lib_IntVector_Intrinsics_vec128_xor(key34, + Lib_IntVector_Intrinsics_vec128_shift_left(key34, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key44); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key15 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key45 = + Lib_IntVector_Intrinsics_vec128_xor(key35, + Lib_IntVector_Intrinsics_vec128_shift_left(key35, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key45); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key16 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, 
+ Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key46 = + Lib_IntVector_Intrinsics_vec128_xor(key36, + Lib_IntVector_Intrinsics_vec128_shift_left(key36, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key46); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key47 = + Lib_IntVector_Intrinsics_vec128_xor(key37, + Lib_IntVector_Intrinsics_vec128_shift_left(key37, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key47); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key18 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key48 = + Lib_IntVector_Intrinsics_vec128_xor(key38, + Lib_IntVector_Intrinsics_vec128_shift_left(key38, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key48); + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_128_NI_aes128_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + uint32_t counter1 = counter; + uint32_t counter0 = htobe32(counter1); + uint32_t counter11 = htobe32(counter1 + (uint32_t)1U); + uint32_t counter2 = htobe32(counter1 + (uint32_t)2U); + uint32_t counter3 = htobe32(counter1 + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter11, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn 
= kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128_store128_le(kb, st[0U]); +} + +inline void +Hacl_AES_128_NI_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[12U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *uu____0 = kex0; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + Lib_IntVector_Intrinsics_vec128 *prev = kex0; + Lib_IntVector_Intrinsics_vec128 *next = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + 
Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key3); /* kex0[1] is now complete. The same step is repeated below for kex0[2]..kex0[10]: keygen_assist on the previous slot with a per-step constant (0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36), shuffle32 with all selectors 3, then XOR with the previous slot folded three times through x ^ shift_left(x, 32). */ + Lib_IntVector_Intrinsics_vec128 *prev1 = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev2 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key31); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key5 = prev3[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key33); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key7 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key35); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key9 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key37); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key38); /* kex0[0]..kex0[10] (ctx[1]..ctx[11]) are now all written; next the 12-byte nonce goes into ctx[0] via a zero-padded 16-byte buffer */ + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); /* main loop: one iteration per full 64-byte chunk; chunk i uses counter values c + 4*i .. c + 4*i + 3 (uint32_t arithmetic) */ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + 
uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); /* st[0..3] now hold the keystream for this chunk; load the four 16-byte input vectors, XOR, and store to the matching output offsets */ + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + 
v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } /* tail: the rem = len % 64 leftover bytes are staged in the zero-initialised 64-byte buffer `last`, processed there as one full chunk, and only the first rem bytes are copied to out */ + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + 
Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); /* the XOR below runs in place over all 64 bytes of `last`, including the zero padding beyond rem */ + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + 
Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + /* Same parameters as Hacl_AES_128_NI_aes128_ctr_encrypt, and the visible body performs the same steps: build ctx locally from k and n, then XOR inp with the keystream starting at counter c and write the result to out. */ +inline void +Hacl_AES_128_NI_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[12U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *uu____0 = kex0; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + Lib_IntVector_Intrinsics_vec128 *prev = kex0; + Lib_IntVector_Intrinsics_vec128 *next = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key3); + Lib_IntVector_Intrinsics_vec128 *prev1 = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + 
Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key30); /* kex0[2] complete; the steps below fill kex0[3]..kex0[10] the same way, with keygen_assist constants 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 */ + Lib_IntVector_Intrinsics_vec128 *prev2 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key31); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key5 = 
prev3[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key33); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key7 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 
+ key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key35); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key9 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + 
Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key37); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + 
Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key38); /* kex0[10] written: ctx[1]..ctx[11] are filled; the 12-byte nonce is placed in ctx[0] next via a zero-padded 16-byte buffer */ + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); /* full 64-byte chunks: chunk i uses counter values c + 4*i .. c + 4*i + 3 (uint32_t arithmetic) */ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], 
k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); /* the ni_aes_enc / ni_aes_enc_last calls are used here in the decrypt routine as well; the input is simply XORed with the resulting st[0..3] */ + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } /* tail: rem = len % 64 leftover bytes are staged in the zero-initialised 64-byte buffer `last`, processed as one full chunk, and only rem bytes are copied out */ + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * 
(uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = 
Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); /* only the first rem bytes of the processed staging buffer reach the caller's output */ + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + diff --git a/src/Hacl_Gf128_NI.c b/src/Hacl_Gf128_NI.c new file mode 100644 index 00000000..3747dd87 --- /dev/null +++ b/src/Hacl_Gf128_NI.c @@ -0,0 +1,359 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above 
copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_Gf128_NI.h" + +static inline void +fadd0(Lib_IntVector_Intrinsics_vec128 *x, Lib_IntVector_Intrinsics_vec128 *y) +{ + x[0U] = Lib_IntVector_Intrinsics_vec128_xor(x[0U], y[0U]); +} + +static inline void +fmul0(Lib_IntVector_Intrinsics_vec128 *x, Lib_IntVector_Intrinsics_vec128 *y) +{ + Lib_IntVector_Intrinsics_vec128 xe = x[0U]; + Lib_IntVector_Intrinsics_vec128 ye = y[0U]; + Lib_IntVector_Intrinsics_vec128 + lo0 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 m1 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m2 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 hi = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 m11 = Lib_IntVector_Intrinsics_vec128_xor(m1, m2); + Lib_IntVector_Intrinsics_vec128 + m21 = Lib_IntVector_Intrinsics_vec128_shift_left(m11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + m12 = Lib_IntVector_Intrinsics_vec128_shift_right(m11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo10 = Lib_IntVector_Intrinsics_vec128_xor(lo0, m21); + Lib_IntVector_Intrinsics_vec128 hi10 = Lib_IntVector_Intrinsics_vec128_xor(hi, m12); + Lib_IntVector_Intrinsics_vec128 hi0 = hi10; + Lib_IntVector_Intrinsics_vec128 lo = lo10; + 
Lib_IntVector_Intrinsics_vec128 + lo1 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo2 = Lib_IntVector_Intrinsics_vec128_shift_left(lo1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo3 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 lo31 = Lib_IntVector_Intrinsics_vec128_xor(lo3, lo2); + Lib_IntVector_Intrinsics_vec128 + hi1 = Lib_IntVector_Intrinsics_vec128_shift_right64(hi0, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + hi11 = Lib_IntVector_Intrinsics_vec128_shift_left(hi1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + hi2 = Lib_IntVector_Intrinsics_vec128_shift_left64(hi0, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 hi21 = Lib_IntVector_Intrinsics_vec128_xor(hi2, hi11); + Lib_IntVector_Intrinsics_vec128 + lo11 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo12 = Lib_IntVector_Intrinsics_vec128_shift_right(lo11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 hi22 = Lib_IntVector_Intrinsics_vec128_xor(hi21, lo12); + Lib_IntVector_Intrinsics_vec128 lo4 = lo31; + Lib_IntVector_Intrinsics_vec128 hi3 = hi22; + Lib_IntVector_Intrinsics_vec128 + lo13 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo21 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)62U); + Lib_IntVector_Intrinsics_vec128 + lo32 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)57U); + Lib_IntVector_Intrinsics_vec128 lo14 = Lib_IntVector_Intrinsics_vec128_xor(lo13, lo21); + Lib_IntVector_Intrinsics_vec128 lo15 = Lib_IntVector_Intrinsics_vec128_xor(lo14, lo32); + Lib_IntVector_Intrinsics_vec128 + lo22 = Lib_IntVector_Intrinsics_vec128_shift_right(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo33 = Lib_IntVector_Intrinsics_vec128_shift_left(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo5 = 
Lib_IntVector_Intrinsics_vec128_xor(lo4, lo33); + Lib_IntVector_Intrinsics_vec128 lo_ = lo22; + Lib_IntVector_Intrinsics_vec128 + lo16 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 + lo23 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 + lo34 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)7U); + Lib_IntVector_Intrinsics_vec128 lo17 = Lib_IntVector_Intrinsics_vec128_xor(lo16, lo23); + Lib_IntVector_Intrinsics_vec128 lo18 = Lib_IntVector_Intrinsics_vec128_xor(lo17, lo34); + Lib_IntVector_Intrinsics_vec128 lo19 = Lib_IntVector_Intrinsics_vec128_xor(lo18, lo_); + Lib_IntVector_Intrinsics_vec128 lo6 = Lib_IntVector_Intrinsics_vec128_xor(lo5, lo19); + Lib_IntVector_Intrinsics_vec128 lo7 = Lib_IntVector_Intrinsics_vec128_xor(lo6, hi3); + Lib_IntVector_Intrinsics_vec128 lo110 = lo7; + x[0U] = lo110; +} + +static inline void load_precompute_r(Lib_IntVector_Intrinsics_vec128 *pre, uint8_t *key) +{ + Lib_IntVector_Intrinsics_vec128 *r4 = pre; + Lib_IntVector_Intrinsics_vec128 *r3 = pre + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *r2 = pre + (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + r1[0U] = Lib_IntVector_Intrinsics_vec128_load_be(key); + r4[0U] = r1[0U]; + r3[0U] = r1[0U]; + r2[0U] = r1[0U]; + fmul0(r2, r1); + fmul0(r3, r2); + fmul0(r4, r3); +} + +static inline void +normalize4( + Lib_IntVector_Intrinsics_vec128 *acc, + Lib_IntVector_Intrinsics_vec128 *x, + Lib_IntVector_Intrinsics_vec128 *pre +) +{ + Lib_IntVector_Intrinsics_vec128 x1 = x[0U]; + Lib_IntVector_Intrinsics_vec128 x2 = x[1U]; + Lib_IntVector_Intrinsics_vec128 x3 = x[2U]; + Lib_IntVector_Intrinsics_vec128 x4 = x[3U]; + Lib_IntVector_Intrinsics_vec128 y1 = pre[0U]; + Lib_IntVector_Intrinsics_vec128 y2 = pre[1U]; + Lib_IntVector_Intrinsics_vec128 y3 = pre[2U]; + Lib_IntVector_Intrinsics_vec128 y4 = pre[3U]; + Lib_IntVector_Intrinsics_vec128 + lo10 = 
Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo2 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo30 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo40 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 lo0 = Lib_IntVector_Intrinsics_vec128_xor(lo10, lo2); + Lib_IntVector_Intrinsics_vec128 lo5 = Lib_IntVector_Intrinsics_vec128_xor(lo0, lo30); + Lib_IntVector_Intrinsics_vec128 lo6 = Lib_IntVector_Intrinsics_vec128_xor(lo5, lo40); + Lib_IntVector_Intrinsics_vec128 m1 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m2 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m3 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m4 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m = Lib_IntVector_Intrinsics_vec128_xor(m1, m2); + Lib_IntVector_Intrinsics_vec128 m5 = Lib_IntVector_Intrinsics_vec128_xor(m, m3); + Lib_IntVector_Intrinsics_vec128 m6 = Lib_IntVector_Intrinsics_vec128_xor(m5, m4); + Lib_IntVector_Intrinsics_vec128 + m11 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m21 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m31 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m41 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 m7 = Lib_IntVector_Intrinsics_vec128_xor(m6, m11); + Lib_IntVector_Intrinsics_vec128 m8 = Lib_IntVector_Intrinsics_vec128_xor(m7, m21); + Lib_IntVector_Intrinsics_vec128 m9 = Lib_IntVector_Intrinsics_vec128_xor(m8, m31); + Lib_IntVector_Intrinsics_vec128 m10 = 
Lib_IntVector_Intrinsics_vec128_xor(m9, m41); + Lib_IntVector_Intrinsics_vec128 + hi10 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi20 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi30 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi4 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 hi = Lib_IntVector_Intrinsics_vec128_xor(hi10, hi20); + Lib_IntVector_Intrinsics_vec128 hi5 = Lib_IntVector_Intrinsics_vec128_xor(hi, hi30); + Lib_IntVector_Intrinsics_vec128 hi6 = Lib_IntVector_Intrinsics_vec128_xor(hi5, hi4); + Lib_IntVector_Intrinsics_vec128 + m12 = Lib_IntVector_Intrinsics_vec128_shift_left(m10, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + m22 = Lib_IntVector_Intrinsics_vec128_shift_right(m10, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo7 = Lib_IntVector_Intrinsics_vec128_xor(lo6, m12); + Lib_IntVector_Intrinsics_vec128 hi7 = Lib_IntVector_Intrinsics_vec128_xor(hi6, m22); + Lib_IntVector_Intrinsics_vec128 hi0 = hi7; + Lib_IntVector_Intrinsics_vec128 lo = lo7; + Lib_IntVector_Intrinsics_vec128 + lo1 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo20 = Lib_IntVector_Intrinsics_vec128_shift_left(lo1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo3 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 lo31 = Lib_IntVector_Intrinsics_vec128_xor(lo3, lo20); + Lib_IntVector_Intrinsics_vec128 + hi1 = Lib_IntVector_Intrinsics_vec128_shift_right64(hi0, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + hi11 = Lib_IntVector_Intrinsics_vec128_shift_left(hi1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + hi2 = Lib_IntVector_Intrinsics_vec128_shift_left64(hi0, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 hi21 = 
Lib_IntVector_Intrinsics_vec128_xor(hi2, hi11); + Lib_IntVector_Intrinsics_vec128 + lo11 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo12 = Lib_IntVector_Intrinsics_vec128_shift_right(lo11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 hi22 = Lib_IntVector_Intrinsics_vec128_xor(hi21, lo12); + Lib_IntVector_Intrinsics_vec128 lo4 = lo31; + Lib_IntVector_Intrinsics_vec128 hi3 = hi22; + Lib_IntVector_Intrinsics_vec128 + lo13 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo21 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)62U); + Lib_IntVector_Intrinsics_vec128 + lo32 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)57U); + Lib_IntVector_Intrinsics_vec128 lo14 = Lib_IntVector_Intrinsics_vec128_xor(lo13, lo21); + Lib_IntVector_Intrinsics_vec128 lo15 = Lib_IntVector_Intrinsics_vec128_xor(lo14, lo32); + Lib_IntVector_Intrinsics_vec128 + lo22 = Lib_IntVector_Intrinsics_vec128_shift_right(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo33 = Lib_IntVector_Intrinsics_vec128_shift_left(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo50 = Lib_IntVector_Intrinsics_vec128_xor(lo4, lo33); + Lib_IntVector_Intrinsics_vec128 lo_ = lo22; + Lib_IntVector_Intrinsics_vec128 + lo16 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 + lo23 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 + lo34 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)7U); + Lib_IntVector_Intrinsics_vec128 lo17 = Lib_IntVector_Intrinsics_vec128_xor(lo16, lo23); + Lib_IntVector_Intrinsics_vec128 lo18 = Lib_IntVector_Intrinsics_vec128_xor(lo17, lo34); + Lib_IntVector_Intrinsics_vec128 lo19 = Lib_IntVector_Intrinsics_vec128_xor(lo18, lo_); + Lib_IntVector_Intrinsics_vec128 lo60 = Lib_IntVector_Intrinsics_vec128_xor(lo50, lo19); + 
Lib_IntVector_Intrinsics_vec128 lo70 = Lib_IntVector_Intrinsics_vec128_xor(lo60, hi3); + Lib_IntVector_Intrinsics_vec128 lo110 = lo70; + acc[0U] = lo110; +} + +void Hacl_Gf128_NI_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + acc[0U] = Lib_IntVector_Intrinsics_vec128_zero; + load_precompute_r(pre, key); +} + +void +Hacl_Gf128_NI_gcm_update_blocks( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *text +) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 f[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + b4[0U] = Lib_IntVector_Intrinsics_vec128_load_be(tb); + b4[1U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)16U); + b4[2U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)32U); + b4[3U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)48U); + b4[0U] = Lib_IntVector_Intrinsics_vec128_xor(acc[0U], b4[0U]); + normalize4(acc, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + elem = Lib_IntVector_Intrinsics_vec128_load_be(tb); + fadd0(acc, &elem); + fmul0(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = 
Lib_IntVector_Intrinsics_vec128_zero; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + elem = Lib_IntVector_Intrinsics_vec128_load_be(b); + fadd0(acc, &elem); + fmul0(acc, r1); + return; + } +} + +void +(*Hacl_Gf128_NI_gcm_update_padded)( + Lib_IntVector_Intrinsics_vec128 *x0, + uint32_t x1, + uint8_t *x2 +) = Hacl_Gf128_NI_gcm_update_blocks; + +void Hacl_Gf128_NI_gcm_emit(uint8_t *tag, Lib_IntVector_Intrinsics_vec128 *ctx) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128_store_be(tag, acc[0U]); +} + +void Hacl_Gf128_NI_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[5U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre0 = ctx + (uint32_t)1U; + acc[0U] = Lib_IntVector_Intrinsics_vec128_zero; + load_precompute_r(pre0, key); + Lib_IntVector_Intrinsics_vec128 *acc0 = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 f[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + b4[0U] = Lib_IntVector_Intrinsics_vec128_load_be(tb); + b4[1U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)16U); + b4[2U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)32U); + b4[3U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)48U); + b4[0U] = Lib_IntVector_Intrinsics_vec128_xor(acc0[0U], b4[0U]); + normalize4(acc0, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = 
(uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + elem = Lib_IntVector_Intrinsics_vec128_load_be(tb); + fadd0(acc0, &elem); + fmul0(acc0, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + elem = Lib_IntVector_Intrinsics_vec128_load_be(b); + fadd0(acc0, &elem); + fmul0(acc0, r1); + } + Lib_IntVector_Intrinsics_vec128 *acc1 = ctx; + Lib_IntVector_Intrinsics_vec128_store_be(tag, acc1[0U]); +} + diff --git a/src/Hacl_Gf128_PreComp.c b/src/Hacl_Gf128_PreComp.c new file mode 100644 index 00000000..fa12b870 --- /dev/null +++ b/src/Hacl_Gf128_PreComp.c @@ -0,0 +1,461 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_Gf128_PreComp.h" + +void Hacl_Impl_Gf128_FieldPreComp_fmul(uint64_t *x, uint64_t *y) +{ + uint64_t res[2U] = { 0U }; + uint64_t y_[2U] = { 0U }; + y_[0U] = y[0U]; + y_[1U] = y[1U]; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); + res[0U] = res[0U] ^ (y_[0U] & m); + res[1U] = res[1U] ^ (y_[1U] & m); + uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); + y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; + y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); + res[0U] = res[0U] ^ (y_[0U] & m); + res[1U] = res[1U] ^ (y_[1U] & m); + uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); + y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; + y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); + } + x[0U] = res[0U]; + x[1U] = res[1U]; +} + +static inline void prepare(uint64_t *pre, uint64_t *r) +{ + memset(pre, 0U, (uint32_t)256U * sizeof (uint64_t)); + uint64_t sh[2U] = { 0U }; + sh[0U] = r[0U]; + sh[1U] = r[1U]; + uint64_t *pre1 = pre; + uint64_t *pre2 = pre + (uint32_t)128U; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + memcpy(pre1 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); + uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); + sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; + sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + memcpy(pre2 + (uint32_t)2U * 
i, sh, (uint32_t)2U * sizeof (uint64_t)); + uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); + sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; + sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); + } +} + +void Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(uint64_t *pre, uint8_t *key) +{ + uint64_t *r4321 = pre; + uint64_t *r1 = r4321 + (uint32_t)6U; + uint64_t *r2 = r4321 + (uint32_t)4U; + uint64_t *r3 = r4321 + (uint32_t)2U; + uint64_t *r4 = r4321; + uint64_t *table2 = pre + (uint32_t)8U; + uint64_t u = load64_be(key); + r1[1U] = u; + uint64_t u0 = load64_be(key + (uint32_t)8U); + r1[0U] = u0; + r4[0U] = r1[0U]; + r4[1U] = r1[1U]; + r3[0U] = r1[0U]; + r3[1U] = r1[1U]; + r2[0U] = r1[0U]; + r2[1U] = r1[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(r2, r1); + Hacl_Impl_Gf128_FieldPreComp_fmul(r3, r2); + Hacl_Impl_Gf128_FieldPreComp_fmul(r4, r3); + prepare(table2, r4); +} + +static inline void fmul_pre(uint64_t *x, uint64_t *pre) +{ + uint64_t *tab = pre + (uint32_t)8U; + uint64_t tmp[2U] = { 0U }; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t *uu____0 = tab + (uint32_t)2U * i; + uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); + tmp[0U] = tmp[0U] ^ (uu____0[0U] & m); + tmp[1U] = tmp[1U] ^ (uu____0[1U] & m); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t *uu____1 = tab + (uint32_t)128U + (uint32_t)2U * i; + uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); + tmp[0U] = tmp[0U] ^ (uu____1[0U] & m); + tmp[1U] = tmp[1U] ^ (uu____1[1U] & m); + } + x[0U] = tmp[0U]; + x[1U] = tmp[1U]; +} + +void Hacl_Impl_Gf128_FieldPreComp_fmul_r4(uint64_t *x, uint64_t *pre) +{ + fmul_pre(x, pre); + fmul_pre(x + (uint32_t)2U, pre); + fmul_pre(x + (uint32_t)4U, pre); + fmul_pre(x + (uint32_t)6U, pre); +} + +void Hacl_Impl_Gf128_FieldPreComp_normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) +{ + uint64_t *x1 = x; + uint64_t *x2 = x + (uint32_t)2U; + 
uint64_t *x3 = x + (uint32_t)4U; + uint64_t *x4 = x + (uint32_t)6U; + fmul_pre(x, pre); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)2U, pre + (uint32_t)2U); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)4U, pre + (uint32_t)4U); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)6U, pre + (uint32_t)6U); + acc[0U] = x1[0U]; + acc[1U] = x1[1U]; + acc[0U] = acc[0U] ^ x2[0U]; + acc[1U] = acc[1U] ^ x2[1U]; + acc[0U] = acc[0U] ^ x3[0U]; + acc[1U] = acc[1U] ^ x3[1U]; + acc[0U] = acc[0U] ^ x4[0U]; + acc[1U] = acc[1U] ^ x4[1U]; +} + +void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre, key); +} + +void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f0[8U] = { 0U }; + uint64_t *b4 = f0; + uint64_t f[8U] = { 0U }; + uint64_t *acc4 = f; + uint8_t *tb = t0; + memcpy(acc4, acc, (uint32_t)2U * sizeof (uint64_t)); + uint64_t *x00 = b4; + uint8_t *y00 = tb; + uint64_t *x10 = b4 + (uint32_t)2U; + uint8_t *y10 = tb + (uint32_t)16U; + uint64_t *x20 = b4 + (uint32_t)4U; + uint8_t *y20 = tb + (uint32_t)32U; + uint64_t *x30 = b4 + (uint32_t)6U; + uint8_t *y30 = tb + (uint32_t)48U; + uint64_t u0 = load64_be(y00); + x00[1U] = u0; + uint64_t u1 = load64_be(y00 + (uint32_t)8U); + x00[0U] = u1; + uint64_t u2 = load64_be(y10); + x10[1U] = u2; + uint64_t u3 = load64_be(y10 + (uint32_t)8U); + x10[0U] = u3; + uint64_t u4 = load64_be(y20); + x20[1U] = u4; + uint64_t u5 = load64_be(y20 + (uint32_t)8U); + x20[0U] = u5; + uint64_t u6 = load64_be(y30); + x30[1U] = u6; + uint64_t u7 = load64_be(y30 + (uint32_t)8U); + x30[0U] = u7; + uint64_t *x01 = acc4; + uint64_t *y01 = b4; + uint64_t *x11 = acc4 + 
(uint32_t)2U; + uint64_t *y11 = b4 + (uint32_t)2U; + uint64_t *x21 = acc4 + (uint32_t)4U; + uint64_t *y21 = b4 + (uint32_t)4U; + uint64_t *x31 = acc4 + (uint32_t)6U; + uint64_t *y31 = b4 + (uint32_t)6U; + x01[0U] = x01[0U] ^ y01[0U]; + x01[1U] = x01[1U] ^ y01[1U]; + x11[0U] = x11[0U] ^ y11[0U]; + x11[1U] = x11[1U] ^ y11[1U]; + x21[0U] = x21[0U] ^ y21[0U]; + x21[1U] = x21[1U] ^ y21[1U]; + x31[0U] = x31[0U] ^ y31[0U]; + x31[1U] = x31[1U] ^ y31[1U]; + uint32_t len1 = len0 - (uint32_t)64U; + uint8_t *text1 = t0 + (uint32_t)64U; + uint32_t nb = len1 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb1 = text1 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y02 = tb1; + uint64_t *x12 = b4 + (uint32_t)2U; + uint8_t *y12 = tb1 + (uint32_t)16U; + uint64_t *x22 = b4 + (uint32_t)4U; + uint8_t *y22 = tb1 + (uint32_t)32U; + uint64_t *x32 = b4 + (uint32_t)6U; + uint8_t *y32 = tb1 + (uint32_t)48U; + uint64_t u = load64_be(y02); + x0[1U] = u; + uint64_t u8 = load64_be(y02 + (uint32_t)8U); + x0[0U] = u8; + uint64_t u9 = load64_be(y12); + x12[1U] = u9; + uint64_t u10 = load64_be(y12 + (uint32_t)8U); + x12[0U] = u10; + uint64_t u11 = load64_be(y22); + x22[1U] = u11; + uint64_t u12 = load64_be(y22 + (uint32_t)8U); + x22[0U] = u12; + uint64_t u13 = load64_be(y32); + x32[1U] = u13; + uint64_t u14 = load64_be(y32 + (uint32_t)8U); + x32[0U] = u14; + Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); + uint64_t *x02 = acc4; + uint64_t *y0 = b4; + uint64_t *x1 = acc4 + (uint32_t)2U; + uint64_t *y1 = b4 + (uint32_t)2U; + uint64_t *x2 = acc4 + (uint32_t)4U; + uint64_t *y2 = b4 + (uint32_t)4U; + uint64_t *x3 = acc4 + (uint32_t)6U; + uint64_t *y3 = b4 + (uint32_t)6U; + x02[0U] = x02[0U] ^ y0[0U]; + x02[1U] = x02[1U] ^ y0[1U]; + x1[0U] = x1[0U] ^ y1[0U]; + x1[1U] = x1[1U] ^ y1[1U]; + x2[0U] = x2[0U] ^ y2[0U]; + x2[1U] = x2[1U] ^ y2[1U]; + x3[0U] = x3[0U] ^ y3[0U]; + x3[1U] = x3[1U] ^ y3[1U]; + } + Hacl_Impl_Gf128_FieldPreComp_normalize4(acc, acc4, pre); + } + 
uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r1 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); + return; + } +} + +void +(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = + Hacl_Gf128_PreComp_gcm_update_blocks; + +void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx) +{ + uint64_t *acc = ctx; + uint64_t r0 = acc[1U]; + uint64_t r1 = acc[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + +void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + uint64_t ctx[266U] = { 0U }; + uint64_t *acc = ctx; + uint64_t *pre0 = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre0, key); + uint64_t *acc0 = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f0[8U] = { 0U }; + uint64_t *b4 = f0; + uint64_t f[8U] = { 0U }; + uint64_t *acc4 = f; + uint8_t *tb = t0; + memcpy(acc4, acc0, (uint32_t)2U * sizeof (uint64_t)); + uint64_t *x00 = b4; + uint8_t *y00 = tb; + uint64_t *x10 = b4 + (uint32_t)2U; + uint8_t 
*y10 = tb + (uint32_t)16U; + uint64_t *x20 = b4 + (uint32_t)4U; + uint8_t *y20 = tb + (uint32_t)32U; + uint64_t *x30 = b4 + (uint32_t)6U; + uint8_t *y30 = tb + (uint32_t)48U; + uint64_t u0 = load64_be(y00); + x00[1U] = u0; + uint64_t u1 = load64_be(y00 + (uint32_t)8U); + x00[0U] = u1; + uint64_t u2 = load64_be(y10); + x10[1U] = u2; + uint64_t u3 = load64_be(y10 + (uint32_t)8U); + x10[0U] = u3; + uint64_t u4 = load64_be(y20); + x20[1U] = u4; + uint64_t u5 = load64_be(y20 + (uint32_t)8U); + x20[0U] = u5; + uint64_t u6 = load64_be(y30); + x30[1U] = u6; + uint64_t u7 = load64_be(y30 + (uint32_t)8U); + x30[0U] = u7; + uint64_t *x01 = acc4; + uint64_t *y01 = b4; + uint64_t *x11 = acc4 + (uint32_t)2U; + uint64_t *y11 = b4 + (uint32_t)2U; + uint64_t *x21 = acc4 + (uint32_t)4U; + uint64_t *y21 = b4 + (uint32_t)4U; + uint64_t *x31 = acc4 + (uint32_t)6U; + uint64_t *y31 = b4 + (uint32_t)6U; + x01[0U] = x01[0U] ^ y01[0U]; + x01[1U] = x01[1U] ^ y01[1U]; + x11[0U] = x11[0U] ^ y11[0U]; + x11[1U] = x11[1U] ^ y11[1U]; + x21[0U] = x21[0U] ^ y21[0U]; + x21[1U] = x21[1U] ^ y21[1U]; + x31[0U] = x31[0U] ^ y31[0U]; + x31[1U] = x31[1U] ^ y31[1U]; + uint32_t len1 = len0 - (uint32_t)64U; + uint8_t *text1 = t0 + (uint32_t)64U; + uint32_t nb = len1 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb1 = text1 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y02 = tb1; + uint64_t *x12 = b4 + (uint32_t)2U; + uint8_t *y12 = tb1 + (uint32_t)16U; + uint64_t *x22 = b4 + (uint32_t)4U; + uint8_t *y22 = tb1 + (uint32_t)32U; + uint64_t *x32 = b4 + (uint32_t)6U; + uint8_t *y32 = tb1 + (uint32_t)48U; + uint64_t u = load64_be(y02); + x0[1U] = u; + uint64_t u8 = load64_be(y02 + (uint32_t)8U); + x0[0U] = u8; + uint64_t u9 = load64_be(y12); + x12[1U] = u9; + uint64_t u10 = load64_be(y12 + (uint32_t)8U); + x12[0U] = u10; + uint64_t u11 = load64_be(y22); + x22[1U] = u11; + uint64_t u12 = load64_be(y22 + (uint32_t)8U); + x22[0U] = u12; + uint64_t u13 = load64_be(y32); + x32[1U] 
= u13; + uint64_t u14 = load64_be(y32 + (uint32_t)8U); + x32[0U] = u14; + Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); + uint64_t *x02 = acc4; + uint64_t *y0 = b4; + uint64_t *x1 = acc4 + (uint32_t)2U; + uint64_t *y1 = b4 + (uint32_t)2U; + uint64_t *x2 = acc4 + (uint32_t)4U; + uint64_t *y2 = b4 + (uint32_t)4U; + uint64_t *x3 = acc4 + (uint32_t)6U; + uint64_t *y3 = b4 + (uint32_t)6U; + x02[0U] = x02[0U] ^ y0[0U]; + x02[1U] = x02[1U] ^ y0[1U]; + x1[0U] = x1[0U] ^ y1[0U]; + x1[1U] = x1[1U] ^ y1[1U]; + x2[0U] = x2[0U] ^ y2[0U]; + x2[1U] = x2[1U] ^ y2[1U]; + x3[0U] = x3[0U] ^ y3[0U]; + x3[1U] = x3[1U] ^ y3[1U]; + } + Hacl_Impl_Gf128_FieldPreComp_normalize4(acc0, acc4, pre); + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r10 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); + } + uint64_t *acc1 = ctx; + uint64_t r0 = acc1[1U]; + uint64_t r1 = acc1[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + diff --git a/src/Hacl_Lib.c b/src/Hacl_Lib.c new file mode 100644 index 00000000..5be84b2b --- /dev/null +++ b/src/Hacl_Lib.c @@ -0,0 +1,193 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 
2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "internal/Hacl_Lib.h" + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux32(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18446744069414584320U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)32U & m), + .snd = (a >> (uint32_t)32U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux16(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18446462603027742720U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)16U & m), + .snd = (a >> (uint32_t)16U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux8(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18374966859414961920U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)8U & m), + .snd = (a >> (uint32_t)8U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x8 transpose_aux32(Lib_Transposition64x8_uint64x8 x) +{ + uint64_t x7 = x.snd.snd.snd; + uint64_t x6 = x.snd.snd.fst; + uint64_t x5 = x.snd.fst.snd; + uint64_t x4 = x.snd.fst.fst; + uint64_t x3 = x.fst.snd.snd; + uint64_t x2 = x.fst.snd.fst; + uint64_t x1 = x.fst.fst.snd; + uint64_t x0 = x.fst.fst.fst; + Lib_Transposition64x8_uint64x2 scrut0 = transpose_aux_aux32(x0, x4); + uint64_t y0 = scrut0.fst; + uint64_t y4 = scrut0.snd; + Lib_Transposition64x8_uint64x2 scrut1 = transpose_aux_aux32(x1, x5); + uint64_t y1 = scrut1.fst; + uint64_t y5 = scrut1.snd; + Lib_Transposition64x8_uint64x2 scrut2 = transpose_aux_aux32(x2, x6); + uint64_t y2 = scrut2.fst; + uint64_t y6 = scrut2.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux32(x3, x7); + uint64_t y3 = scrut.fst; + uint64_t y7 = scrut.snd; + return + ( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = y0, .snd = y1 }, .snd = { .fst = y2, .snd = y3 } }, + .snd = { .fst = { .fst = y4, .snd = y5 }, .snd = { .fst = y6, .snd = y7 } } + } + ); +} + +static 
Lib_Transposition64x8_uint64x4 transpose_aux16(Lib_Transposition64x8_uint64x4 x) +{ + uint64_t x3 = x.snd.snd; + uint64_t x2 = x.snd.fst; + uint64_t x1 = x.fst.snd; + uint64_t x0 = x.fst.fst; + Lib_Transposition64x8_uint64x2 scrut0 = transpose_aux_aux16(x0, x2); + uint64_t y0 = scrut0.fst; + uint64_t y2 = scrut0.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux16(x1, x3); + uint64_t y1 = scrut.fst; + uint64_t y3 = scrut.snd; + return + ( + (Lib_Transposition64x8_uint64x4){ + .fst = { .fst = y0, .snd = y1 }, + .snd = { .fst = y2, .snd = y3 } + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux8(Lib_Transposition64x8_uint64x2 x) +{ + uint64_t x0 = x.fst; + uint64_t x1 = x.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux8(x0, x1); + uint64_t y0 = scrut.fst; + uint64_t y1 = scrut.snd; + return ((Lib_Transposition64x8_uint64x2){ .fst = y0, .snd = y1 }); +} + +uint64_t Lib_Transposition64x8_transpose_bits64(uint64_t x) +{ + uint64_t m0 = (uint64_t)0x8040201008040201U; + uint64_t m1 = (uint64_t)0x4020100804020100U; + uint64_t m2 = (uint64_t)0x2010080402010000U; + uint64_t m3 = (uint64_t)0x1008040201000000U; + uint64_t m4 = (uint64_t)0x0804020100000000U; + uint64_t m5 = (uint64_t)0x0402010000000000U; + uint64_t m6 = (uint64_t)0x0201000000000000U; + uint64_t m7 = (uint64_t)0x0100000000000000U; + uint64_t y0 = x & m0; + uint64_t y1 = y0 | (x & m1) >> (uint32_t)7U; + uint64_t y2 = y1 | (x & m2) >> (uint32_t)14U; + uint64_t y3 = y2 | (x & m3) >> (uint32_t)21U; + uint64_t y4 = y3 | (x & m4) >> (uint32_t)28U; + uint64_t y5 = y4 | (x & m5) >> (uint32_t)35U; + uint64_t y6 = y5 | (x & m6) >> (uint32_t)42U; + uint64_t y7 = y6 | (x & m7) >> (uint32_t)49U; + uint64_t y8 = y7 | (x << (uint32_t)7U & m1); + uint64_t y9 = y8 | (x << (uint32_t)14U & m2); + uint64_t y10 = y9 | (x << (uint32_t)21U & m3); + uint64_t y11 = y10 | (x << (uint32_t)28U & m4); + uint64_t y12 = y11 | (x << (uint32_t)35U & m5); + uint64_t y13 = y12 | (x << (uint32_t)42U & 
m6); + return y13 | (x << (uint32_t)49U & m7); +} + +Lib_Transposition64x8_uint64x8 +Lib_Transposition64x8_transpose_bits64x8(Lib_Transposition64x8_uint64x8 a) +{ + Lib_Transposition64x8_uint64x8 scrut0 = transpose_aux32(a); + Lib_Transposition64x8_uint64x4 b0 = scrut0.fst; + Lib_Transposition64x8_uint64x4 b1 = scrut0.snd; + Lib_Transposition64x8_uint64x4 scrut1 = transpose_aux16(b0); + Lib_Transposition64x8_uint64x2 c0 = scrut1.fst; + Lib_Transposition64x8_uint64x2 c1 = scrut1.snd; + Lib_Transposition64x8_uint64x4 scrut2 = transpose_aux16(b1); + Lib_Transposition64x8_uint64x2 c2 = scrut2.fst; + Lib_Transposition64x8_uint64x2 c3 = scrut2.snd; + Lib_Transposition64x8_uint64x2 scrut3 = transpose_aux8(c0); + uint64_t d0 = scrut3.fst; + uint64_t d1 = scrut3.snd; + Lib_Transposition64x8_uint64x2 scrut4 = transpose_aux8(c1); + uint64_t d2 = scrut4.fst; + uint64_t d3 = scrut4.snd; + Lib_Transposition64x8_uint64x2 scrut5 = transpose_aux8(c2); + uint64_t d4 = scrut5.fst; + uint64_t d5 = scrut5.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux8(c3); + uint64_t d6 = scrut.fst; + uint64_t d7 = scrut.snd; + uint64_t e0 = Lib_Transposition64x8_transpose_bits64(d0); + uint64_t e1 = Lib_Transposition64x8_transpose_bits64(d1); + uint64_t e2 = Lib_Transposition64x8_transpose_bits64(d2); + uint64_t e3 = Lib_Transposition64x8_transpose_bits64(d3); + uint64_t e4 = Lib_Transposition64x8_transpose_bits64(d4); + uint64_t e5 = Lib_Transposition64x8_transpose_bits64(d5); + uint64_t e6 = Lib_Transposition64x8_transpose_bits64(d6); + uint64_t e7 = Lib_Transposition64x8_transpose_bits64(d7); + return + ( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = e0, .snd = e1 }, .snd = { .fst = e2, .snd = e3 } }, + .snd = { .fst = { .fst = e4, .snd = e5 }, .snd = { .fst = e6, .snd = e7 } } + } + ); +} + diff --git a/third-party/bearssl/README.md b/third-party/bearssl/README.md new file mode 100644 index 00000000..e194299c --- /dev/null +++ b/third-party/bearssl/README.md @@ -0,0 
+1 @@ +We took the code from BearSSL repository: https://bearssl.org/gitweb/?p=BearSSL;a=commit;h=79c060eea3eea1257797f15ea1608a9a9923aa6f diff --git a/third-party/bearssl/aes_ct64.c b/third-party/bearssl/aes_ct64.c new file mode 100644 index 00000000..c4d3c357 --- /dev/null +++ b/third-party/bearssl/aes_ct64.c @@ -0,0 +1,400 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "inner.h" + +/* see inner.h */ +void +br_aes_ct64_bitslice_Sbox(uint64_t *q) +{ + /* + * This S-box implementation is a straightforward translation of + * the circuit described by Boyar and Peralta in "A new + * combinational logic minimization technique with applications + * to cryptology" (https://eprint.iacr.org/2009/191.pdf). + * + * Note that variables x* (input) and s* (output) are numbered + * in "reverse" order (x0 is the high bit, x7 is the low bit). 
+ */ + + uint64_t x0, x1, x2, x3, x4, x5, x6, x7; + uint64_t y1, y2, y3, y4, y5, y6, y7, y8, y9; + uint64_t y10, y11, y12, y13, y14, y15, y16, y17, y18, y19; + uint64_t y20, y21; + uint64_t z0, z1, z2, z3, z4, z5, z6, z7, z8, z9; + uint64_t z10, z11, z12, z13, z14, z15, z16, z17; + uint64_t t0, t1, t2, t3, t4, t5, t6, t7, t8, t9; + uint64_t t10, t11, t12, t13, t14, t15, t16, t17, t18, t19; + uint64_t t20, t21, t22, t23, t24, t25, t26, t27, t28, t29; + uint64_t t30, t31, t32, t33, t34, t35, t36, t37, t38, t39; + uint64_t t40, t41, t42, t43, t44, t45, t46, t47, t48, t49; + uint64_t t50, t51, t52, t53, t54, t55, t56, t57, t58, t59; + uint64_t t60, t61, t62, t63, t64, t65, t66, t67; + uint64_t s0, s1, s2, s3, s4, s5, s6, s7; + + x0 = q[7]; + x1 = q[6]; + x2 = q[5]; + x3 = q[4]; + x4 = q[3]; + x5 = q[2]; + x6 = q[1]; + x7 = q[0]; + + /* + * Top linear transformation. + */ + y14 = x3 ^ x5; + y13 = x0 ^ x6; + y9 = x0 ^ x3; + y8 = x0 ^ x5; + t0 = x1 ^ x2; + y1 = t0 ^ x7; + y4 = y1 ^ x3; + y12 = y13 ^ y14; + y2 = y1 ^ x0; + y5 = y1 ^ x6; + y3 = y5 ^ y8; + t1 = x4 ^ y12; + y15 = t1 ^ x5; + y20 = t1 ^ x1; + y6 = y15 ^ x7; + y10 = y15 ^ t0; + y11 = y20 ^ y9; + y7 = x7 ^ y11; + y17 = y10 ^ y11; + y19 = y10 ^ y8; + y16 = t0 ^ y11; + y21 = y13 ^ y16; + y18 = x0 ^ y16; + + /* + * Non-linear section. 
+ */ + t2 = y12 & y15; + t3 = y3 & y6; + t4 = t3 ^ t2; + t5 = y4 & x7; + t6 = t5 ^ t2; + t7 = y13 & y16; + t8 = y5 & y1; + t9 = t8 ^ t7; + t10 = y2 & y7; + t11 = t10 ^ t7; + t12 = y9 & y11; + t13 = y14 & y17; + t14 = t13 ^ t12; + t15 = y8 & y10; + t16 = t15 ^ t12; + t17 = t4 ^ t14; + t18 = t6 ^ t16; + t19 = t9 ^ t14; + t20 = t11 ^ t16; + t21 = t17 ^ y20; + t22 = t18 ^ y19; + t23 = t19 ^ y21; + t24 = t20 ^ y18; + + t25 = t21 ^ t22; + t26 = t21 & t23; + t27 = t24 ^ t26; + t28 = t25 & t27; + t29 = t28 ^ t22; + t30 = t23 ^ t24; + t31 = t22 ^ t26; + t32 = t31 & t30; + t33 = t32 ^ t24; + t34 = t23 ^ t33; + t35 = t27 ^ t33; + t36 = t24 & t35; + t37 = t36 ^ t34; + t38 = t27 ^ t36; + t39 = t29 & t38; + t40 = t25 ^ t39; + + t41 = t40 ^ t37; + t42 = t29 ^ t33; + t43 = t29 ^ t40; + t44 = t33 ^ t37; + t45 = t42 ^ t41; + z0 = t44 & y15; + z1 = t37 & y6; + z2 = t33 & x7; + z3 = t43 & y16; + z4 = t40 & y1; + z5 = t29 & y7; + z6 = t42 & y11; + z7 = t45 & y17; + z8 = t41 & y10; + z9 = t44 & y12; + z10 = t37 & y3; + z11 = t33 & y4; + z12 = t43 & y13; + z13 = t40 & y5; + z14 = t29 & y2; + z15 = t42 & y9; + z16 = t45 & y14; + z17 = t41 & y8; + + /* + * Bottom linear transformation. 
+ */ + t46 = z15 ^ z16; + t47 = z10 ^ z11; + t48 = z5 ^ z13; + t49 = z9 ^ z10; + t50 = z2 ^ z12; + t51 = z2 ^ z5; + t52 = z7 ^ z8; + t53 = z0 ^ z3; + t54 = z6 ^ z7; + t55 = z16 ^ z17; + t56 = z12 ^ t48; + t57 = t50 ^ t53; + t58 = z4 ^ t46; + t59 = z3 ^ t54; + t60 = t46 ^ t57; + t61 = z14 ^ t57; + t62 = t52 ^ t58; + t63 = t49 ^ t58; + t64 = z4 ^ t59; + t65 = t61 ^ t62; + t66 = z1 ^ t63; + s0 = t59 ^ t63; + s6 = t56 ^ ~t62; + s7 = t48 ^ ~t60; + t67 = t64 ^ t65; + s3 = t53 ^ t66; + s4 = t51 ^ t66; + s5 = t47 ^ t65; + s1 = t64 ^ ~s3; + s2 = t55 ^ ~t67; + + q[7] = s0; + q[6] = s1; + q[5] = s2; + q[4] = s3; + q[3] = s4; + q[2] = s5; + q[1] = s6; + q[0] = s7; +} + +/* see inner.h */ +void +br_aes_ct64_ortho(uint64_t *q) +{ +#define SWAPN(cl, ch, s, x, y) do { \ + uint64_t a, b; \ + a = (x); \ + b = (y); \ + (x) = (a & (uint64_t)cl) | ((b & (uint64_t)cl) << (s)); \ + (y) = ((a & (uint64_t)ch) >> (s)) | (b & (uint64_t)ch); \ + } while (0) + +#define SWAP2(x, y) SWAPN(0x5555555555555555, 0xAAAAAAAAAAAAAAAA, 1, x, y) +#define SWAP4(x, y) SWAPN(0x3333333333333333, 0xCCCCCCCCCCCCCCCC, 2, x, y) +#define SWAP8(x, y) SWAPN(0x0F0F0F0F0F0F0F0F, 0xF0F0F0F0F0F0F0F0, 4, x, y) + + SWAP2(q[0], q[1]); + SWAP2(q[2], q[3]); + SWAP2(q[4], q[5]); + SWAP2(q[6], q[7]); + + SWAP4(q[0], q[2]); + SWAP4(q[1], q[3]); + SWAP4(q[4], q[6]); + SWAP4(q[5], q[7]); + + SWAP8(q[0], q[4]); + SWAP8(q[1], q[5]); + SWAP8(q[2], q[6]); + SWAP8(q[3], q[7]); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w) +{ + uint64_t x0, x1, x2, x3; + + x0 = w[0]; + x1 = w[1]; + x2 = w[2]; + x3 = w[3]; + x0 |= (x0 << 16); + x1 |= (x1 << 16); + x2 |= (x2 << 16); + x3 |= (x3 << 16); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + x0 |= (x0 << 8); + x1 |= (x1 << 8); + x2 |= (x2 << 8); + x3 |= (x3 << 8); + x0 &= (uint64_t)0x00FF00FF00FF00FF; + x1 &= 
(uint64_t)0x00FF00FF00FF00FF; + x2 &= (uint64_t)0x00FF00FF00FF00FF; + x3 &= (uint64_t)0x00FF00FF00FF00FF; + *q0 = x0 | (x2 << 8); + *q1 = x1 | (x3 << 8); +} + +/* see inner.h */ +void +br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1) +{ + uint64_t x0, x1, x2, x3; + + x0 = q0 & (uint64_t)0x00FF00FF00FF00FF; + x1 = q1 & (uint64_t)0x00FF00FF00FF00FF; + x2 = (q0 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x3 = (q1 >> 8) & (uint64_t)0x00FF00FF00FF00FF; + x0 |= (x0 >> 8); + x1 |= (x1 >> 8); + x2 |= (x2 >> 8); + x3 |= (x3 >> 8); + x0 &= (uint64_t)0x0000FFFF0000FFFF; + x1 &= (uint64_t)0x0000FFFF0000FFFF; + x2 &= (uint64_t)0x0000FFFF0000FFFF; + x3 &= (uint64_t)0x0000FFFF0000FFFF; + w[0] = (uint32_t)x0 | (uint32_t)(x0 >> 16); + w[1] = (uint32_t)x1 | (uint32_t)(x1 >> 16); + w[2] = (uint32_t)x2 | (uint32_t)(x2 >> 16); + w[3] = (uint32_t)x3 | (uint32_t)(x3 >> 16); +} + +static const unsigned char Rcon[] = { + 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1B, 0x36 +}; + +static uint32_t +sub_word(uint32_t x) +{ + uint64_t q[8]; + + memset(q, 0, sizeof q); + q[0] = x; + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_Sbox(q); + br_aes_ct64_ortho(q); + return (uint32_t)q[0]; +} + +/* see inner.h */ +unsigned +br_aes_ct64_keysched(uint64_t *comp_skey, const void *key, size_t key_len) +{ + unsigned num_rounds; + int i, j, k, nk, nkf; + uint32_t tmp; + uint32_t skey[60]; + + switch (key_len) { + case 16: + num_rounds = 10; + break; + case 24: + num_rounds = 12; + break; + case 32: + num_rounds = 14; + break; + default: + /* abort(); */ + return 0; + } + nk = (int)(key_len >> 2); + nkf = (int)((num_rounds + 1) << 2); + br_range_dec32le(skey, (key_len >> 2), key); + tmp = skey[(key_len >> 2) - 1]; + for (i = nk, j = 0, k = 0; i < nkf; i ++) { + if (j == 0) { + tmp = (tmp << 24) | (tmp >> 8); + tmp = sub_word(tmp) ^ Rcon[k]; + } else if (nk > 6 && j == 4) { + tmp = sub_word(tmp); + } + tmp ^= skey[i - nk]; + skey[i] = tmp; + if (++ j == nk) { + j = 0; + k ++; + } + } + + 
for (i = 0, j = 0; i < nkf; i += 4, j += 2) { + uint64_t q[8]; + + br_aes_ct64_interleave_in(&q[0], &q[4], skey + i); + q[1] = q[0]; + q[2] = q[0]; + q[3] = q[0]; + q[5] = q[4]; + q[6] = q[4]; + q[7] = q[4]; + br_aes_ct64_ortho(q); + comp_skey[j + 0] = + (q[0] & (uint64_t)0x1111111111111111) + | (q[1] & (uint64_t)0x2222222222222222) + | (q[2] & (uint64_t)0x4444444444444444) + | (q[3] & (uint64_t)0x8888888888888888); + comp_skey[j + 1] = + (q[4] & (uint64_t)0x1111111111111111) + | (q[5] & (uint64_t)0x2222222222222222) + | (q[6] & (uint64_t)0x4444444444444444) + | (q[7] & (uint64_t)0x8888888888888888); + } + return num_rounds; +} + +/* see inner.h */ +void +br_aes_ct64_skey_expand(uint64_t *skey, + unsigned num_rounds, const uint64_t *comp_skey) +{ + unsigned u, v, n; + + n = (num_rounds + 1) << 1; + for (u = 0, v = 0; u < n; u ++, v += 4) { + uint64_t x0, x1, x2, x3; + + x0 = x1 = x2 = x3 = comp_skey[u]; + x0 &= (uint64_t)0x1111111111111111; + x1 &= (uint64_t)0x2222222222222222; + x2 &= (uint64_t)0x4444444444444444; + x3 &= (uint64_t)0x8888888888888888; + x1 >>= 1; + x2 >>= 2; + x3 >>= 3; + skey[v + 0] = (x0 << 4) - x0; + skey[v + 1] = (x1 << 4) - x1; + skey[v + 2] = (x2 << 4) - x2; + skey[v + 3] = (x3 << 4) - x3; + } +} diff --git a/third-party/bearssl/aes_ct64_ctr.c b/third-party/bearssl/aes_ct64_ctr.c new file mode 100644 index 00000000..2decdf5f --- /dev/null +++ b/third-party/bearssl/aes_ct64_ctr.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this 
permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "bearssl_block.h" +#include "inner.h" + +/* see bearssl_block.h */ +void +br_aes_ct64_ctr_init(br_aes_ct64_ctr_keys *ctx, + const void *key, size_t len) +{ + ctx->vtable = &br_aes_ct64_ctr_vtable; + ctx->num_rounds = br_aes_ct64_keysched(ctx->skey, key, len); +} + +static void +xorbuf(void *dst, const void *src, size_t len) +{ + unsigned char *d; + const unsigned char *s; + + d = dst; + s = src; + while (len -- > 0) { + *d ++ ^= *s ++; + } +} + +/* see bearssl_block.h */ +uint32_t +br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len) +{ + unsigned char *buf; + uint32_t ivw[16]; + uint64_t sk_exp[120]; + + br_aes_ct64_skey_expand(sk_exp, ctx->num_rounds, ctx->skey); + br_range_dec32le(ivw, 3, iv); + memcpy(ivw + 4, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 8, ivw, 3 * sizeof(uint32_t)); + memcpy(ivw + 12, ivw, 3 * sizeof(uint32_t)); + buf = data; + while (len > 0) { + uint64_t q[8]; + uint32_t w[16]; + unsigned char tmp[64]; + int i; + + /* + * TODO: see if we can save on the first br_aes_ct64_ortho() + * call, since iv0/iv1/iv2 are constant for the whole run. 
+ */ + memcpy(w, ivw, sizeof ivw); + w[3] = br_swap32(cc); + w[7] = br_swap32(cc + 1); + w[11] = br_swap32(cc + 2); + w[15] = br_swap32(cc + 3); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_in( + &q[i], &q[i + 4], w + (i << 2)); + } + br_aes_ct64_ortho(q); + br_aes_ct64_bitslice_encrypt(ctx->num_rounds, sk_exp, q); + br_aes_ct64_ortho(q); + for (i = 0; i < 4; i ++) { + br_aes_ct64_interleave_out( + w + (i << 2), q[i], q[i + 4]); + } + br_range_enc32le(tmp, w, 16); + if (len <= 64) { + xorbuf(buf, tmp, len); + cc += (uint32_t)len >> 4; + break; + } + xorbuf(buf, tmp, 64); + buf += 64; + len -= 64; + cc += 4; + } + return cc; +} + +/* see bearssl_block.h */ +const br_block_ctr_class br_aes_ct64_ctr_vtable = { + sizeof(br_aes_ct64_ctr_keys), + 16, + 4, + (void (*)(const br_block_ctr_class **, const void *, size_t)) + &br_aes_ct64_ctr_init, + (uint32_t (*)(const br_block_ctr_class *const *, + const void *, uint32_t, void *, size_t)) + &br_aes_ct64_ctr_run +}; diff --git a/third-party/bearssl/aes_ct64_enc.c b/third-party/bearssl/aes_ct64_enc.c new file mode 100644 index 00000000..152fee4e --- /dev/null +++ b/third-party/bearssl/aes_ct64_enc.c @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "inner.h" + +static inline void +add_round_key(uint64_t *q, const uint64_t *sk) +{ + q[0] ^= sk[0]; + q[1] ^= sk[1]; + q[2] ^= sk[2]; + q[3] ^= sk[3]; + q[4] ^= sk[4]; + q[5] ^= sk[5]; + q[6] ^= sk[6]; + q[7] ^= sk[7]; +} + +static inline void +shift_rows(uint64_t *q) +{ + int i; + + for (i = 0; i < 8; i ++) { + uint64_t x; + + x = q[i]; + q[i] = (x & (uint64_t)0x000000000000FFFF) + | ((x & (uint64_t)0x00000000FFF00000) >> 4) + | ((x & (uint64_t)0x00000000000F0000) << 12) + | ((x & (uint64_t)0x0000FF0000000000) >> 8) + | ((x & (uint64_t)0x000000FF00000000) << 8) + | ((x & (uint64_t)0xF000000000000000) >> 12) + | ((x & (uint64_t)0x0FFF000000000000) << 4); + } +} + +static inline uint64_t +rotr32(uint64_t x) +{ + return (x << 32) | (x >> 32); +} + +static inline void +mix_columns(uint64_t *q) +{ + uint64_t q0, q1, q2, q3, q4, q5, q6, q7; + uint64_t r0, r1, r2, r3, r4, r5, r6, r7; + + q0 = q[0]; + q1 = q[1]; + q2 = q[2]; + q3 = q[3]; + q4 = q[4]; + q5 = q[5]; + q6 = q[6]; + q7 = q[7]; + r0 = (q0 >> 16) | (q0 << 48); + r1 = (q1 >> 16) | (q1 << 48); + r2 = (q2 >> 16) | (q2 << 48); + r3 = (q3 >> 16) | (q3 << 48); + r4 = (q4 >> 16) | (q4 << 48); + r5 = (q5 >> 16) | (q5 << 48); + r6 = (q6 >> 16) | (q6 << 48); + r7 = (q7 >> 16) | (q7 << 48); + + q[0] = q7 ^ r7 ^ r0 ^ rotr32(q0 ^ r0); + q[1] = q0 ^ r0 ^ q7 ^ r7 ^ r1 ^ rotr32(q1 ^ r1); + q[2] = q1 ^ r1 ^ r2 ^ rotr32(q2 ^ r2); + q[3] = q2 ^ r2 ^ q7 ^ r7 ^ r3 ^ rotr32(q3 ^ r3); + q[4] = q3 ^ r3 ^ q7 ^ r7 ^ r4 ^ 
rotr32(q4 ^ r4); + q[5] = q4 ^ r4 ^ r5 ^ rotr32(q5 ^ r5); + q[6] = q5 ^ r5 ^ r6 ^ rotr32(q6 ^ r6); + q[7] = q6 ^ r6 ^ r7 ^ rotr32(q7 ^ r7); +} + +/* see inner.h */ +void +br_aes_ct64_bitslice_encrypt(unsigned num_rounds, + const uint64_t *skey, uint64_t *q) +{ + unsigned u; + + add_round_key(q, skey); + for (u = 1; u < num_rounds; u ++) { + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + mix_columns(q); + add_round_key(q, skey + (u << 3)); + } + br_aes_ct64_bitslice_Sbox(q); + shift_rows(q); + add_round_key(q, skey + (num_rounds << 3)); +} diff --git a/third-party/bearssl/bearssl_aead.h b/third-party/bearssl/bearssl_aead.h new file mode 100644 index 00000000..388d93b7 --- /dev/null +++ b/third-party/bearssl/bearssl_aead.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef BR_BEARSSL_AEAD_H__ +#define BR_BEARSSL_AEAD_H__ + +#include "bearssl_block.h" +#include "bearssl_hash.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief Class type of an AEAD algorithm. + */ +typedef struct br_aead_class_ br_aead_class; +struct br_aead_class_ { + + /** + * \brief Size (in bytes) of authentication tags created by + * this AEAD algorithm. + */ + size_t tag_size; + + /** + * \brief Reset an AEAD context. + * + * This function resets an already initialised AEAD context for + * a new computation run. Implementations and keys are + * conserved. This function can be called at any time; it + * cancels any ongoing AEAD computation that uses the provided + * context structure. + + * The provided IV is a _nonce_. Each AEAD algorithm has its + * own requirements on IV size and contents; for most of them, + * it is crucial to security that each nonce value is used + * only once for a given secret key. + * + * \param cc AEAD context structure. + * \param iv AEAD nonce to use. + * \param len AEAD nonce length (in bytes). + */ + void (*reset)(const br_aead_class **cc, const void *iv, size_t len); + + /** + * \brief Inject additional authenticated data. + * + * The provided data is injected into a running AEAD + * computation. Additional data must be injected _before_ the + * call to `flip()`. Additional data can be injected in several + * chunks of arbitrary length. + * + * \param cc AEAD context structure. + * \param data pointer to additional authenticated data. + * \param len length of additional authenticated data (in bytes). + */ + void (*aad_inject)(const br_aead_class **cc, + const void *data, size_t len); + + /** + * \brief Finish injection of additional authenticated data. + * + * This function MUST be called before beginning the actual + * encryption or decryption (with `run()`), even if no + * additional authenticated data was injected. No additional + * authenticated data may be injected after this function call. 
+ * + * \param cc AEAD context structure. + */ + void (*flip)(const br_aead_class **cc); + + /** + * \brief Encrypt or decrypt some data. + * + * Data encryption or decryption can be done after `flip()` has + * been called on the context. If `encrypt` is non-zero, then + * the provided data shall be plaintext, and it is encrypted in + * place. Otherwise, the data shall be ciphertext, and it is + * decrypted in place. + * + * Data may be provided in several chunks of arbitrary length. + * + * \param cc AEAD context structure. + * \param encrypt non-zero for encryption, zero for decryption. + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + */ + void (*run)(const br_aead_class **cc, int encrypt, + void *data, size_t len); + + /** + * \brief Compute authentication tag. + * + * Compute the AEAD authentication tag. The tag length depends + * on the AEAD algorithm; it is written in the provided `tag` + * buffer. This call terminates the AEAD run: no data may be + * processed with that AEAD context afterwards, until `reset()` + * is called to initiate a new AEAD run. + * + * The tag value must normally be sent along with the encrypted + * data. When decrypting, the tag value must be recomputed and + * compared with the received tag: if the two tag values differ, + * then either the tag or the encrypted data was altered in + * transit. As an alternative to this function, the + * `check_tag()` function may be used to compute and check the + * tag value. + * + * Tag length depends on the AEAD algorithm. + * + * \param cc AEAD context structure. + * \param tag destination buffer for the tag. + */ + void (*get_tag)(const br_aead_class **cc, void *tag); + + /** + * \brief Compute and check authentication tag. + * + * This function is an alternative to `get_tag()`, and is + * normally used on the receiving end (i.e. when decrypting + * messages). The tag value is recomputed and compared with the + * provided tag value. 
If they match, 1 is returned; on + * mismatch, 0 is returned. A returned value of 0 means that the + * data or the tag was altered in transit, normally leading to + * wholesale rejection of the complete message. + * + * Tag length depends on the AEAD algorithm. + * + * \param cc AEAD context structure. + * \param tag tag value to compare with. + * \return 1 on success (exact match of tag value), 0 otherwise. + */ + uint32_t (*check_tag)(const br_aead_class **cc, const void *tag); + + /** + * \brief Compute authentication tag (with truncation). + * + * This function is similar to `get_tag()`, except that the tag + * length is provided. Some AEAD algorithms allow several tag + * lengths, usually by truncating the normal tag. Shorter tags + * mechanically increase success probability of forgeries. + * The range of allowed tag lengths depends on the algorithm. + * + * \param cc AEAD context structure. + * \param tag destination buffer for the tag. + * \param len tag length (in bytes). + */ + void (*get_tag_trunc)(const br_aead_class **cc, void *tag, size_t len); + + /** + * \brief Compute and check authentication tag (with truncation). + * + * This function is similar to `check_tag()` except that it + * works over an explicit tag length. See `get_tag()` for a + * discussion of explicit tag lengths; the range of allowed tag + * lengths depends on the algorithm. + * + * \param cc AEAD context structure. + * \param tag tag value to compare with. + * \param len tag length (in bytes). + * \return 1 on success (exact match of tag value), 0 otherwise. + */ + uint32_t (*check_tag_trunc)(const br_aead_class **cc, + const void *tag, size_t len); +}; + +/** + * \brief Context structure for GCM. + * + * GCM is an AEAD mode that combines a block cipher in CTR mode with a + * MAC based on GHASH, to provide authenticated encryption: + * + * - Any block cipher with 16-byte blocks can be used with GCM. 
+ * + * - The nonce can have any length, from 0 up to 2^64-1 bits; however, + * 96-bit nonces (12 bytes) are recommended (nonces with a length + * distinct from 12 bytes are internally hashed, which risks reusing + * nonce value with a small but not always negligible probability). + * + * - Additional authenticated data may have length up to 2^64-1 bits. + * + * - Message length may range up to 2^39-256 bits at most. + * + * - The authentication tag has length 16 bytes. + * + * The GCM initialisation function receives as parameter an + * _initialised_ block cipher implementation context, with the secret + * key already set. A pointer to that context will be kept within the + * GCM context structure. It is up to the caller to allocate and + * initialise that block cipher context. + */ +typedef struct { + /** \brief Pointer to vtable for this context. */ + const br_aead_class *vtable; + + const br_block_ctr_class **bctx; + br_ghash gh; + unsigned char h[16]; + unsigned char j0_1[12]; + unsigned char buf[16]; + unsigned char y[16]; + uint32_t j0_2, jc; + uint64_t count_aad, count_ctr; +} br_gcm_context; + +/** + * \brief Initialize a GCM context. + * + * A block cipher implementation, with its initialised context structure, + * is provided. The block cipher MUST use 16-byte blocks in CTR mode, + * and its secret key MUST have been already set in the provided context. + * A GHASH implementation must also be provided. The parameters are linked + * in the GCM context. + * + * After this function has been called, the `br_gcm_reset()` function must + * be called, to provide the IV for GCM computation. + * + * \param ctx GCM context structure. + * \param bctx block cipher context (already initialised with secret key). + * \param gh GHASH implementation. + */ +void br_gcm_init(br_gcm_context *ctx, + const br_block_ctr_class **bctx, br_ghash gh); + +/** + * \brief Reset a GCM context. 
+ * + * This function resets an already initialised GCM context for a new + * computation run. Implementations and keys are conserved. This function + * can be called at any time; it cancels any ongoing GCM computation that + * uses the provided context structure. + * + * The provided IV is a _nonce_. It is critical to GCM security that IV + * values are not repeated for the same encryption key. IV can have + * arbitrary length (up to 2^64-1 bits), but the "normal" length is + * 96 bits (12 bytes). + * + * \param ctx GCM context structure. + * \param iv GCM nonce to use. + * \param len GCM nonce length (in bytes). + */ +void br_gcm_reset(br_gcm_context *ctx, const void *iv, size_t len); + +/** + * \brief Inject additional authenticated data into GCM. + * + * The provided data is injected into a running GCM computation. Additional + * data must be injected _before_ the call to `br_gcm_flip()`. + * Additional data can be injected in several chunks of arbitrary length; + * the maximum total size of additional authenticated data is 2^64-1 + * bits. + * + * \param ctx GCM context structure. + * \param data pointer to additional authenticated data. + * \param len length of additional authenticated data (in bytes). + */ +void br_gcm_aad_inject(br_gcm_context *ctx, const void *data, size_t len); + +/** + * \brief Finish injection of additional authenticated data into GCM. + * + * This function MUST be called before beginning the actual encryption + * or decryption (with `br_gcm_run()`), even if no additional authenticated + * data was injected. No additional authenticated data may be injected + * after this function call. + * + * \param ctx GCM context structure. + */ +void br_gcm_flip(br_gcm_context *ctx); + +/** + * \brief Encrypt or decrypt some data with GCM. + * + * Data encryption or decryption can be done after `br_gcm_flip()` + * has been called on the context. If `encrypt` is non-zero, then the + * provided data shall be plaintext, and it is encrypted in place. 
+ * Otherwise, the data shall be ciphertext, and it is decrypted in place. + * + * Data may be provided in several chunks of arbitrary length. The maximum + * total length for data is 2^39-256 bits, i.e. about 65 gigabytes. + * + * \param ctx GCM context structure. + * \param encrypt non-zero for encryption, zero for decryption. + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + */ +void br_gcm_run(br_gcm_context *ctx, int encrypt, void *data, size_t len); + +/** + * \brief Compute GCM authentication tag. + * + * Compute the GCM authentication tag. The tag is a 16-byte value which + * is written in the provided `tag` buffer. This call terminates the + * GCM run: no data may be processed with that GCM context afterwards, + * until `br_gcm_reset()` is called to initiate a new GCM run. + * + * The tag value must normally be sent along with the encrypted data. + * When decrypting, the tag value must be recomputed and compared with + * the received tag: if the two tag values differ, then either the tag + * or the encrypted data was altered in transit. As an alternative to + * this function, the `br_gcm_check_tag()` function can be used to + * compute and check the tag value. + * + * \param ctx GCM context structure. + * \param tag destination buffer for the tag (16 bytes). + */ +void br_gcm_get_tag(br_gcm_context *ctx, void *tag); + +/** + * \brief Compute and check GCM authentication tag. + * + * This function is an alternative to `br_gcm_get_tag()`, normally used + * on the receiving end (i.e. when decrypting value). The tag value is + * recomputed and compared with the provided tag value. If they match, 1 + * is returned; on mismatch, 0 is returned. A returned value of 0 means + * that the data or the tag was altered in transit, normally leading to + * wholesale rejection of the complete message. + * + * \param ctx GCM context structure. + * \param tag tag value to compare with (16 bytes). 
+ * \return 1 on success (exact match of tag value), 0 otherwise. + */ +uint32_t br_gcm_check_tag(br_gcm_context *ctx, const void *tag); + +/** + * \brief Compute GCM authentication tag (with truncation). + * + * This function is similar to `br_gcm_get_tag()`, except that it allows + * the tag to be truncated to a smaller length. The intended tag length + * is provided as `len` (in bytes); it MUST be no more than 16, but + * it may be smaller. Note that decreasing tag length mechanically makes + * forgeries easier; NIST SP 800-38D specifies that the tag length shall + * lie between 12 and 16 bytes (inclusive), but may be truncated down to + * 4 or 8 bytes, for specific applications that can tolerate it. It must + * also be noted that successful forgeries leak information on the + * authentication key, making subsequent forgeries easier. Therefore, + * tag truncation, and in particular truncation to sizes lower than 12 + * bytes, shall be envisioned only with great care. + * + * The tag is written in the provided `tag` buffer. This call terminates + * the GCM run: no data may be processed with that GCM context + * afterwards, until `br_gcm_reset()` is called to initiate a new GCM + * run. + * + * The tag value must normally be sent along with the encrypted data. + * When decrypting, the tag value must be recomputed and compared with + * the received tag: if the two tag values differ, then either the tag + * or the encrypted data was altered in transit. As an alternative to + * this function, the `br_gcm_check_tag_trunc()` function can be used to + * compute and check the tag value. + * + * \param ctx GCM context structure. + * \param tag destination buffer for the tag. + * \param len tag length (16 bytes or less). + */ +void br_gcm_get_tag_trunc(br_gcm_context *ctx, void *tag, size_t len); + +/** + * \brief Compute and check GCM authentication tag (with truncation). 
+ * + * This function is an alternative to `br_gcm_get_tag_trunc()`, normally used + * on the receiving end (i.e. when decrypting value). The tag value is + * recomputed and compared with the provided tag value. If they match, 1 + * is returned; on mismatch, 0 is returned. A returned value of 0 means + * that the data or the tag was altered in transit, normally leading to + * wholesale rejection of the complete message. + * + * Tag length MUST be 16 bytes or less. The normal GCM tag length is 16 + * bytes. See `br_gcm_get_tag_trunc()` for some discussion on the potential + * perils of truncating authentication tags. + * + * \param ctx GCM context structure. + * \param tag tag value to compare with. + * \param len tag length (in bytes). + * \return 1 on success (exact match of tag value), 0 otherwise. + */ +uint32_t br_gcm_check_tag_trunc(br_gcm_context *ctx, + const void *tag, size_t len); + +/** + * \brief Class instance for GCM. + */ +extern const br_aead_class br_gcm_vtable; + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third-party/bearssl/bearssl_block.h b/third-party/bearssl/bearssl_block.h new file mode 100644 index 00000000..c5e7c0af --- /dev/null +++ b/third-party/bearssl/bearssl_block.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef BR_BEARSSL_BLOCK_H__ +#define BR_BEARSSL_BLOCK_H__ + +#include + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief Class type for CTR encryption/decryption implementations. + * + * A `br_block_ctr_class` instance points to the functions implementing + * a specific block cipher, when used in CTR mode for encrypting or + * decrypting data. + */ +typedef struct br_block_ctr_class_ br_block_ctr_class; +struct br_block_ctr_class_ { + /** + * \brief Size (in bytes) of the context structure appropriate + * for containing subkeys. + */ + size_t context_size; + + /** + * \brief Size of individual blocks (in bytes). + */ + unsigned block_size; + + /** + * \brief Base-2 logarithm of the size of individual blocks, + * expressed in bytes. + */ + unsigned log_block_size; + + /** + * \brief Initialisation function. + * + * This function sets the `vtable` field in the context structure. + * The key length MUST be one of the key lengths supported by + * the implementation. + * + * \param ctx context structure to initialise. + * \param key secret key. + * \param key_len key length (in bytes). + */ + void (*init)(const br_block_ctr_class **ctx, + const void *key, size_t key_len); + + /** + * \brief Run the CTR encryption or decryption. + * + * The `iv` parameter points to the IV for this run; its + * length is exactly 4 bytes less than the block size (e.g. + * 12 bytes for AES/CTR). 
The IV is combined with a 32-bit + * block counter to produce the block value which is processed + * with the block cipher. + * + * The data to encrypt or decrypt is updated "in place". Its + * length (`len` bytes) is not required to be a multiple of + * the block size; if the final block is partial, then the + * corresponding key stream bits are dropped. + * + * The resulting counter value is returned. + * + * \param ctx context structure (already initialised). + * \param iv IV for CTR encryption/decryption. + * \param cc initial value for the block counter. + * \param data data to encrypt or decrypt. + * \param len data length (in bytes). + * \return the new block counter value. + */ + uint32_t (*run)(const br_block_ctr_class *const *ctx, + const void *iv, uint32_t cc, void *data, size_t len); +}; + +/** \brief AES block size (16 bytes). */ +#define br_aes_ct64_BLOCK_SIZE 16 + +/** + * \brief Context for AES subkeys (`aes_ct64` implementation, CTR encryption + * and decryption). + * + * First field is a pointer to the vtable; it is set by the initialisation + * function. Other fields are not supposed to be accessed by user code. + */ +typedef struct { + /** \brief Pointer to vtable for this context. */ + const br_block_ctr_class *vtable; + uint64_t skey[30]; + unsigned num_rounds; +} br_aes_ct64_ctr_keys; + +/** + * \brief Class instance for AES CTR encryption and decryption + * (`aes_ct64` implementation). + */ +extern const br_block_ctr_class br_aes_ct64_ctr_vtable; + +/* + * 64-bit constant-time AES implementation. It is similar to 'aes_ct' + * but uses 64-bit registers, making it about twice faster than 'aes_ct' + * on 64-bit platforms, while remaining constant-time and with a similar + * code size. (The doubling in performance is only for CBC decryption + * and CTR mode; CBC encryption is non-parallel and cannot benefit from + * the larger registers.) 
+ */ + +/** + * \brief Context initialisation (key schedule) for AES CTR encryption + * and decryption (`aes_ct64` implementation). + * + * \param ctx context to initialise. + * \param key secret key. + * \param len secret key length (in bytes). + */ +void br_aes_ct64_ctr_init(br_aes_ct64_ctr_keys *ctx, + const void *key, size_t len); + +/** + * \brief CTR encryption and decryption with AES (`aes_ct64` implementation). + * + * \param ctx context (already initialised). + * \param iv IV (constant, 12 bytes). + * \param cc initial block counter value. + * \param data data to encrypt or decrypt (updated). + * \param len data length (in bytes). + * \return new block counter value. + */ +uint32_t br_aes_ct64_ctr_run(const br_aes_ct64_ctr_keys *ctx, + const void *iv, uint32_t cc, void *data, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third-party/bearssl/bearssl_hash.h b/third-party/bearssl/bearssl_hash.h new file mode 100644 index 00000000..ed1d28dc --- /dev/null +++ b/third-party/bearssl/bearssl_hash.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT.
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef BR_BEARSSL_HASH_H__ +#define BR_BEARSSL_HASH_H__ + +#ifdef __cplusplus +extern "C" { +#endif + +/** + * \brief Type for a GHASH implementation. + * + * GHASH is a sort of keyed hash meant to be used to implement GCM in + * combination with a block cipher (with 16-byte blocks). + * + * The `y` array has length 16 bytes and is used for input and output; in + * a complete GHASH run, it starts with an all-zero value. `h` is a 16-byte + * value that serves as key (it is derived from the encryption key in GCM, + * using the block cipher). The data length (`len`) is expressed in bytes. + * The `y` array is updated. + * + * If the data length is not a multiple of 16, then the data is implicitly + * padded with zeros up to the next multiple of 16. Thus, when using GHASH + * in GCM, this method may be called twice, for the associated data and + * for the ciphertext, respectively; the zero-padding implements exactly + * the GCM rules. + * + * \param y the array to update. + * \param h the GHASH key. + * \param data the input data (may be `NULL` if `len` is zero). + * \param len the input data length (in bytes). + */ +typedef void (*br_ghash)(void *y, const void *h, const void *data, size_t len); + +/** + * \brief GHASH implementation using multiplications (64-bit). + * + * This implementation uses multiplications of 64-bit values, with a + * 64-bit result. It is constant-time (if multiplications are + * constant-time). It is substantially faster than `br_ghash_ctmul()` + * and `br_ghash_ctmul32()` on most 64-bit architectures. + * + * \param y the array to update. + * \param h the GHASH key. + * \param data the input data (may be `NULL` if `len` is zero). 
+ * \param len the input data length (in bytes). + */ +void br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/third-party/bearssl/config.cmake b/third-party/bearssl/config.cmake new file mode 100644 index 00000000..1763578b --- /dev/null +++ b/third-party/bearssl/config.cmake @@ -0,0 +1,9 @@ +set(SOURCES_bearssl + ${PROJECT_SOURCE_DIR}/third-party/bearssl/enc32le.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/dec32le.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/aes_ct64.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/aes_ct64_enc.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/aes_ct64_ctr.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/ghash_ctmul64.c + ${PROJECT_SOURCE_DIR}/third-party/bearssl/gcm.c +) diff --git a/third-party/bearssl/dec32le.c b/third-party/bearssl/dec32le.c new file mode 100644 index 00000000..122a8782 --- /dev/null +++ b/third-party/bearssl/dec32le.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "inner.h" + +/* see inner.h */ +void +br_range_dec32le(uint32_t *v, size_t num, const void *src) +{ + const unsigned char *buf; + + buf = src; + while (num -- > 0) { + *v ++ = br_dec32le(buf); + buf += 4; + } +} diff --git a/third-party/bearssl/enc32le.c b/third-party/bearssl/enc32le.c new file mode 100644 index 00000000..d4e726f8 --- /dev/null +++ b/third-party/bearssl/enc32le.c @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "inner.h" + +/* see inner.h */ +void +br_range_enc32le(void *dst, const uint32_t *v, size_t num) +{ + unsigned char *buf; + + buf = dst; + while (num -- > 0) { + br_enc32le(buf, *v ++); + buf += 4; + } +} diff --git a/third-party/bearssl/gcm.c b/third-party/bearssl/gcm.c new file mode 100644 index 00000000..df96cb98 --- /dev/null +++ b/third-party/bearssl/gcm.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2017 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "bearssl_aead.h" +#include "inner.h" + +/* + * Implementation Notes + * ==================== + * + * Since CTR and GHASH implementations can handle only full blocks, a + * 16-byte buffer (buf[]) is maintained in the context: + * + * - When processing AAD, buf[] contains the 0-15 unprocessed bytes. 
+ * + * - When doing CTR encryption / decryption, buf[] contains the AES output + * for the last partial block, to be used with the next few bytes of + * data, as well as the already encrypted bytes. For instance, if the + * processed data length so far is 21 bytes, then buf[0..4] contains + * the five last encrypted bytes, and buf[5..15] contains the next 11 + * AES output bytes to be XORed with the next 11 bytes of input. + * + * The recorded AES output bytes are used to complete the block when + * the corresponding bytes are obtained. Note that buf[] always + * contains the _encrypted_ bytes, whether we apply encryption or + * decryption: these bytes are used as input to GHASH when the block + * is complete. + * + * In both cases, the low bits of the data length counters (count_aad, + * count_ctr) are used to work out the current situation. + */ + +/* see bearssl_aead.h */ +void +br_gcm_init(br_gcm_context *ctx, const br_block_ctr_class **bctx, br_ghash gh) +{ + unsigned char iv[12]; + + ctx->vtable = &br_gcm_vtable; + ctx->bctx = bctx; + ctx->gh = gh; + + /* + * The GHASH key h[] is the raw encryption of the all-zero + * block. Since we only have a CTR implementation, we use it + * with an all-zero IV and a zero counter, to CTR-encrypt an + * all-zero block. + */ + memset(ctx->h, 0, sizeof ctx->h); + memset(iv, 0, sizeof iv); + (*bctx)->run(bctx, iv, 0, ctx->h, sizeof ctx->h); +} + +/* see bearssl_aead.h */ +void +br_gcm_reset(br_gcm_context *ctx, const void *iv, size_t len) +{ + /* + * If the provided nonce is 12 bytes, then this is the initial + * IV for CTR mode; it will be used with a counter that starts + * at 2 (value 1 is for encrypting the GHASH output into the tag). + * + * If the provided nonce has any other length, then it is hashed + * (with GHASH) into a 16-byte value that will be the IV for CTR + * (both 12-byte IV and 32-bit counter). 
+ */ + if (len == 12) { + memcpy(ctx->j0_1, iv, 12); + ctx->j0_2 = 1; + } else { + unsigned char ty[16], tmp[16]; + + memset(ty, 0, sizeof ty); + ctx->gh(ty, ctx->h, iv, len); + memset(tmp, 0, 8); + br_enc64be(tmp + 8, (uint64_t)len << 3); + ctx->gh(ty, ctx->h, tmp, 16); + memcpy(ctx->j0_1, ty, 12); + ctx->j0_2 = br_dec32be(ty + 12); + } + ctx->jc = ctx->j0_2 + 1; + memset(ctx->y, 0, sizeof ctx->y); + ctx->count_aad = 0; + ctx->count_ctr = 0; +} + +/* see bearssl_aead.h */ +void +br_gcm_aad_inject(br_gcm_context *ctx, const void *data, size_t len) +{ + size_t ptr, dlen; + + ptr = (size_t)ctx->count_aad & (size_t)15; + if (ptr != 0) { + /* + * If there is a partial block, then we first try to + * complete it. + */ + size_t clen; + + clen = 16 - ptr; + if (len < clen) { + memcpy(ctx->buf + ptr, data, len); + ctx->count_aad += (uint64_t)len; + return; + } + memcpy(ctx->buf + ptr, data, clen); + ctx->gh(ctx->y, ctx->h, ctx->buf, 16); + data = (const unsigned char *)data + clen; + len -= clen; + ctx->count_aad += (uint64_t)clen; + } + + /* + * Now AAD is aligned on a 16-byte block (with regards to GHASH). + * We process all complete blocks, and save the last partial + * block. + */ + dlen = len & ~(size_t)15; + ctx->gh(ctx->y, ctx->h, data, dlen); + memcpy(ctx->buf, (const unsigned char *)data + dlen, len - dlen); + ctx->count_aad += (uint64_t)len; +} + +/* see bearssl_aead.h */ +void +br_gcm_flip(br_gcm_context *ctx) +{ + /* + * We complete the GHASH computation if there is a partial block. + * The GHASH implementation automatically applies padding with + * zeros. 
+ */ + size_t ptr; + + ptr = (size_t)ctx->count_aad & (size_t)15; + if (ptr != 0) { + ctx->gh(ctx->y, ctx->h, ctx->buf, ptr); + } +} + +/* see bearssl_aead.h */ +void +br_gcm_run(br_gcm_context *ctx, int encrypt, void *data, size_t len) +{ + unsigned char *buf; + size_t ptr, dlen; + + buf = data; + ptr = (size_t)ctx->count_ctr & (size_t)15; + if (ptr != 0) { + /* + * If we have a partial block, then we try to complete it. + */ + size_t u, clen; + + clen = 16 - ptr; + if (len < clen) { + clen = len; + } + for (u = 0; u < clen; u ++) { + unsigned x, y; + + x = buf[u]; + y = x ^ ctx->buf[ptr + u]; + ctx->buf[ptr + u] = encrypt ? y : x; + buf[u] = y; + } + ctx->count_ctr += (uint64_t)clen; + buf += clen; + len -= clen; + if (ptr + clen < 16) { + return; + } + ctx->gh(ctx->y, ctx->h, ctx->buf, 16); + } + + /* + * Process full blocks. + */ + dlen = len & ~(size_t)15; + if (!encrypt) { + ctx->gh(ctx->y, ctx->h, buf, dlen); + } + ctx->jc = (*ctx->bctx)->run(ctx->bctx, ctx->j0_1, ctx->jc, buf, dlen); + if (encrypt) { + ctx->gh(ctx->y, ctx->h, buf, dlen); + } + buf += dlen; + len -= dlen; + ctx->count_ctr += (uint64_t)dlen; + + if (len > 0) { + /* + * There is a partial block. + */ + size_t u; + + memset(ctx->buf, 0, sizeof ctx->buf); + ctx->jc = (*ctx->bctx)->run(ctx->bctx, ctx->j0_1, + ctx->jc, ctx->buf, 16); + for (u = 0; u < len; u ++) { + unsigned x, y; + + x = buf[u]; + y = x ^ ctx->buf[u]; + ctx->buf[u] = encrypt ? y : x; + buf[u] = y; + } + ctx->count_ctr += (uint64_t)len; + } +} + +/* see bearssl_aead.h */ +void +br_gcm_get_tag(br_gcm_context *ctx, void *tag) +{ + size_t ptr; + unsigned char tmp[16]; + + ptr = (size_t)ctx->count_ctr & (size_t)15; + if (ptr > 0) { + /* + * There is a partial block: encrypted/decrypted data has + * been produced, but the encrypted bytes must still be + * processed by GHASH. + */ + ctx->gh(ctx->y, ctx->h, ctx->buf, ptr); + } + + /* + * Final block for GHASH: the AAD and plaintext lengths (in bits). 
+ */ + br_enc64be(tmp, ctx->count_aad << 3); + br_enc64be(tmp + 8, ctx->count_ctr << 3); + ctx->gh(ctx->y, ctx->h, tmp, 16); + + /* + * Tag is the GHASH output XORed with the encryption of the + * nonce with the initial counter value. + */ + memcpy(tag, ctx->y, 16); + (*ctx->bctx)->run(ctx->bctx, ctx->j0_1, ctx->j0_2, tag, 16); +} + +/* see bearssl_aead.h -- len is clamped to the 16-byte tag size so an oversized request cannot read past tmp[] */ +void +br_gcm_get_tag_trunc(br_gcm_context *ctx, void *tag, size_t len) +{ + unsigned char tmp[16]; + + br_gcm_get_tag(ctx, tmp); + memcpy(tag, tmp, len < sizeof tmp ? len : sizeof tmp); +} + +/* see bearssl_aead.h -- a len above 16 violates the documented contract and is reported as a mismatch (0) instead of reading past tmp[] */ +uint32_t +br_gcm_check_tag_trunc(br_gcm_context *ctx, const void *tag, size_t len) +{ + unsigned char tmp[16]; + size_t u; + int x; + + br_gcm_get_tag(ctx, tmp); + x = (len > sizeof tmp); /* non-zero forces EQ0() to return 0 for an oversized len */ + for (u = 0; u < len && u < sizeof tmp; u ++) { + x |= tmp[u] ^ ((const unsigned char *)tag)[u]; + } + return EQ0(x); +} + +/* see bearssl_aead.h */ +uint32_t +br_gcm_check_tag(br_gcm_context *ctx, const void *tag) +{ + return br_gcm_check_tag_trunc(ctx, tag, 16); +} + +/* see bearssl_aead.h */ +const br_aead_class br_gcm_vtable = { + 16, + (void (*)(const br_aead_class **, const void *, size_t)) + &br_gcm_reset, + (void (*)(const br_aead_class **, const void *, size_t)) + &br_gcm_aad_inject, + (void (*)(const br_aead_class **)) + &br_gcm_flip, + (void (*)(const br_aead_class **, int, void *, size_t)) + &br_gcm_run, + (void (*)(const br_aead_class **, void *)) + &br_gcm_get_tag, + (uint32_t (*)(const br_aead_class **, const void *)) + &br_gcm_check_tag, + (void (*)(const br_aead_class **, void *, size_t)) + &br_gcm_get_tag_trunc, + (uint32_t (*)(const br_aead_class **, const void *, size_t)) + &br_gcm_check_tag_trunc +}; diff --git a/third-party/bearssl/ghash_ctmul64.c b/third-party/bearssl/ghash_ctmul64.c new file mode 100644 index 00000000..1c126ef2 --- /dev/null +++ b/third-party/bearssl/ghash_ctmul64.c @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and
associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "inner.h" + +/* + * This is the 64-bit variant of br_ghash_ctmul32(), with 64-bit operands + * and bit reversal of 64-bit words. 
+ */ + +static inline uint64_t +bmul64(uint64_t x, uint64_t y) +{ + uint64_t x0, x1, x2, x3; + uint64_t y0, y1, y2, y3; + uint64_t z0, z1, z2, z3; + + x0 = x & (uint64_t)0x1111111111111111; + x1 = x & (uint64_t)0x2222222222222222; + x2 = x & (uint64_t)0x4444444444444444; + x3 = x & (uint64_t)0x8888888888888888; + y0 = y & (uint64_t)0x1111111111111111; + y1 = y & (uint64_t)0x2222222222222222; + y2 = y & (uint64_t)0x4444444444444444; + y3 = y & (uint64_t)0x8888888888888888; + z0 = (x0 * y0) ^ (x1 * y3) ^ (x2 * y2) ^ (x3 * y1); + z1 = (x0 * y1) ^ (x1 * y0) ^ (x2 * y3) ^ (x3 * y2); + z2 = (x0 * y2) ^ (x1 * y1) ^ (x2 * y0) ^ (x3 * y3); + z3 = (x0 * y3) ^ (x1 * y2) ^ (x2 * y1) ^ (x3 * y0); + z0 &= (uint64_t)0x1111111111111111; + z1 &= (uint64_t)0x2222222222222222; + z2 &= (uint64_t)0x4444444444444444; + z3 &= (uint64_t)0x8888888888888888; + return z0 | z1 | z2 | z3; +} + +static uint64_t +rev64(uint64_t x) +{ +#define RMS(m, s) do { \ + x = ((x & (uint64_t)(m)) << (s)) \ + | ((x >> (s)) & (uint64_t)(m)); \ + } while (0) + + RMS(0x5555555555555555, 1); + RMS(0x3333333333333333, 2); + RMS(0x0F0F0F0F0F0F0F0F, 4); + RMS(0x00FF00FF00FF00FF, 8); + RMS(0x0000FFFF0000FFFF, 16); + return (x << 32) | (x >> 32); + +#undef RMS +} + +/* see bearssl_hash.h */ +void +br_ghash_ctmul64(void *y, const void *h, const void *data, size_t len) +{ + const unsigned char *buf, *hb; + unsigned char *yb; + uint64_t y0, y1; + uint64_t h0, h1, h2, h0r, h1r, h2r; + + buf = data; + yb = y; + hb = h; + y1 = br_dec64be(yb); + y0 = br_dec64be(yb + 8); + h1 = br_dec64be(hb); + h0 = br_dec64be(hb + 8); + h0r = rev64(h0); + h1r = rev64(h1); + h2 = h0 ^ h1; + h2r = h0r ^ h1r; + while (len > 0) { + const unsigned char *src; + unsigned char tmp[16]; + uint64_t y0r, y1r, y2, y2r; + uint64_t z0, z1, z2, z0h, z1h, z2h; + uint64_t v0, v1, v2, v3; + + if (len >= 16) { + src = buf; + buf += 16; + len -= 16; + } else { + memcpy(tmp, buf, len); + memset(tmp + len, 0, (sizeof tmp) - len); + src = tmp; + len = 0; + } +
y1 ^= br_dec64be(src); + y0 ^= br_dec64be(src + 8); + + y0r = rev64(y0); + y1r = rev64(y1); + y2 = y0 ^ y1; + y2r = y0r ^ y1r; + + z0 = bmul64(y0, h0); + z1 = bmul64(y1, h1); + z2 = bmul64(y2, h2); + z0h = bmul64(y0r, h0r); + z1h = bmul64(y1r, h1r); + z2h = bmul64(y2r, h2r); + z2 ^= z0 ^ z1; + z2h ^= z0h ^ z1h; + z0h = rev64(z0h) >> 1; + z1h = rev64(z1h) >> 1; + z2h = rev64(z2h) >> 1; + + v0 = z0; + v1 = z0h ^ z2; + v2 = z1 ^ z2h; + v3 = z1h; + + v3 = (v3 << 1) | (v2 >> 63); + v2 = (v2 << 1) | (v1 >> 63); + v1 = (v1 << 1) | (v0 >> 63); + v0 = (v0 << 1); + + v2 ^= v0 ^ (v0 >> 1) ^ (v0 >> 2) ^ (v0 >> 7); + v1 ^= (v0 << 63) ^ (v0 << 62) ^ (v0 << 57); + v3 ^= v1 ^ (v1 >> 1) ^ (v1 >> 2) ^ (v1 >> 7); + v2 ^= (v1 << 63) ^ (v1 << 62) ^ (v1 << 57); + + y0 = v2; + y1 = v3; + } + + br_enc64be(yb, y1); + br_enc64be(yb + 8, y0); +} diff --git a/third-party/bearssl/inner.h b/third-party/bearssl/inner.h new file mode 100644 index 00000000..7330e95a --- /dev/null +++ b/third-party/bearssl/inner.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016 Thomas Pornin + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef INNER_H__ +#define INNER_H__ + +#include + +static inline void +br_enc32le(void *dst, uint32_t x) +{ +#if BR_LE_UNALIGNED + ((br_union_u32 *)dst)->u = x; +#else + unsigned char *buf; + + buf = dst; + buf[0] = (unsigned char)x; + buf[1] = (unsigned char)(x >> 8); + buf[2] = (unsigned char)(x >> 16); + buf[3] = (unsigned char)(x >> 24); +#endif +} + +static inline void +br_enc32be(void *dst, uint32_t x) +{ +#if BR_BE_UNALIGNED + ((br_union_u32 *)dst)->u = x; +#else + unsigned char *buf; + + buf = dst; + buf[0] = (unsigned char)(x >> 24); + buf[1] = (unsigned char)(x >> 16); + buf[2] = (unsigned char)(x >> 8); + buf[3] = (unsigned char)x; +#endif +} + +static inline uint32_t +br_dec32le(const void *src) +{ +#if BR_LE_UNALIGNED + return ((const br_union_u32 *)src)->u; +#else + const unsigned char *buf; + + buf = src; + return (uint32_t)buf[0] + | ((uint32_t)buf[1] << 8) + | ((uint32_t)buf[2] << 16) + | ((uint32_t)buf[3] << 24); +#endif +} + +static inline uint32_t +br_dec32be(const void *src) +{ +#if BR_BE_UNALIGNED + return ((const br_union_u32 *)src)->u; +#else + const unsigned char *buf; + + buf = src; + return ((uint32_t)buf[0] << 24) + | ((uint32_t)buf[1] << 16) + | ((uint32_t)buf[2] << 8) + | (uint32_t)buf[3]; +#endif +} + +static inline void +br_enc64be(void *dst, uint64_t x) +{ +#if BR_BE_UNALIGNED + ((br_union_u64 *)dst)->u = x; +#else + unsigned char *buf; + + buf = dst; + br_enc32be(buf, (uint32_t)(x >> 32)); + br_enc32be(buf + 4, (uint32_t)x); +#endif +} + +static inline uint64_t +br_dec64be(const void *src) +{ +#if BR_BE_UNALIGNED + return ((const br_union_u64 *)src)->u; +#else + const unsigned char *buf; + + buf = src; + return ((uint64_t)br_dec32be(buf) << 32) + | 
(uint64_t)br_dec32be(buf + 4); +#endif +} + +/* + * Range decoding and encoding (for several successive values). + */ +void br_range_dec32le(uint32_t *v, size_t num, const void *src); +void br_range_enc32le(void *dst, const uint32_t *v, size_t num); + +/* + * Byte-swap a 32-bit integer. + */ +static inline uint32_t +br_swap32(uint32_t x) +{ + x = ((x & (uint32_t)0x00FF00FF) << 8) + | ((x >> 8) & (uint32_t)0x00FF00FF); + return (x << 16) | (x >> 16); +} + +/* + * Returns 1 if x == 0, 0 otherwise. Take care that the operand is signed. + */ +static inline uint32_t +EQ0(int32_t x) +{ + uint32_t q; + + q = (uint32_t)x; + return ~(q | -q) >> 31; +} + +/* + * Perform bytewise orthogonalization of eight 64-bit words. Bytes + * of q0..q7 are spread over all words: for a byte x that occurs + * at rank i in q[j] (byte x uses bits 8*i to 8*i+7 in q[j]), the bit + * of rank k in x (0 <= k <= 7) goes to q[k] at rank 8*i+j. + * + * This operation is an involution. + */ +void br_aes_ct64_ortho(uint64_t *q); + +/* + * Interleave bytes for an AES input block. If input bytes are + * denoted 0123456789ABCDEF, and have been decoded with little-endian + * convention (w[0] contains 0123, with '3' being most significant; + * w[1] contains 4567, and so on), then output word q0 will be + * set to 08192A3B (again little-endian convention) and q1 will + * be set to 4C5D6E7F. + */ +void br_aes_ct64_interleave_in(uint64_t *q0, uint64_t *q1, const uint32_t *w); + +/* + * Perform the opposite of br_aes_ct64_interleave_in(). + */ +void br_aes_ct64_interleave_out(uint32_t *w, uint64_t q0, uint64_t q1); + +/* + * The AES S-box, as a bitsliced constant-time version. The input array + * consists in eight 64-bit words; 64 S-box instances are computed in + * parallel. Bits 0 to 7 of each S-box input (bit 0 is least significant) + * are spread over the words 0 to 7, at the same rank. + */ +void br_aes_ct64_bitslice_Sbox(uint64_t *q); + +/* + * Compute AES encryption on bitsliced data. 
Since input is stored on + * eight 64-bit words, four block encryptions are actually performed + * in parallel. + */ +void br_aes_ct64_bitslice_encrypt(unsigned num_rounds, + const uint64_t *skey, uint64_t *q); + +/* + * AES key schedule, constant-time version. skey[] is filled with n+1 + * 128-bit subkeys, where n is the number of rounds (10 to 14, depending + * on key size). The number of rounds is returned. If the key size is + * invalid (not 16, 24 or 32), then 0 is returned. + */ +unsigned br_aes_ct64_keysched(uint64_t *comp_skey, + const void *key, size_t key_len); + +/* + * Expand AES subkeys as produced by br_aes_ct64_keysched(), into + * a larger array suitable for br_aes_ct64_bitslice_encrypt() and + * br_aes_ct64_bitslice_decrypt(). + */ +void br_aes_ct64_skey_expand(uint64_t *skey, + unsigned num_rounds, const uint64_t *comp_skey); + +/* ==================================================================== */ + +#endif diff --git a/tools/configure.py b/tools/configure.py index 723446b2..a8aab8e5 100644 --- a/tools/configure.py +++ b/tools/configure.py @@ -141,6 +141,8 @@ def __init__( # If vale is compiled add the include path if len(self.vale_files) != 0: self.include_paths.extend(self.config["vale_include_paths"]) + # Set cpu-features as include paths + self.include_paths.extend(self.config["cpu_features_include_paths"]) # If the build directory is empty, copy the `default_config.h` there to # make the dependency analysis work. 
From 5d40ece8b989015555497cdd505bbff7ea3a74d4 Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Tue, 18 Jul 2023 12:47:20 +0300 Subject: [PATCH 02/10] Add updated AES_GCM files to MSVC --- include/msvc/Hacl_AES_128_BitSlice.h | 73 ++ include/msvc/Hacl_AES_128_GCM_M32.h | 76 ++ include/msvc/Hacl_AES_128_GCM_NI.h | 75 ++ include/msvc/Hacl_AES_128_NI.h | 81 ++ include/msvc/Hacl_Gf128_NI.h | 65 + include/msvc/Hacl_Gf128_PreComp.h | 54 + include/msvc/internal/Hacl_AES_128_BitSlice.h | 83 ++ include/msvc/internal/Hacl_Lib.h | 69 + include/msvc/libintvector.h | 63 +- src/msvc/EverCrypt_AEAD.c | 204 ++- src/msvc/Hacl_AES_128_BitSlice.c | 1105 +++++++++++++++++ src/msvc/Hacl_AES_128_GCM_M32.c | 208 ++++ src/msvc/Hacl_AES_128_GCM_NI.c | 409 ++++++ src/msvc/Hacl_AES_128_NI.c | 1084 ++++++++++++++++ src/msvc/Hacl_Gf128_NI.c | 359 ++++++ src/msvc/Hacl_Gf128_PreComp.c | 461 +++++++ src/msvc/Hacl_Lib.c | 193 +++ 17 files changed, 4645 insertions(+), 17 deletions(-) create mode 100644 include/msvc/Hacl_AES_128_BitSlice.h create mode 100644 include/msvc/Hacl_AES_128_GCM_M32.h create mode 100644 include/msvc/Hacl_AES_128_GCM_NI.h create mode 100644 include/msvc/Hacl_AES_128_NI.h create mode 100644 include/msvc/Hacl_Gf128_NI.h create mode 100644 include/msvc/Hacl_Gf128_PreComp.h create mode 100644 include/msvc/internal/Hacl_AES_128_BitSlice.h create mode 100644 include/msvc/internal/Hacl_Lib.h create mode 100644 src/msvc/Hacl_AES_128_BitSlice.c create mode 100644 src/msvc/Hacl_AES_128_GCM_M32.c create mode 100644 src/msvc/Hacl_AES_128_GCM_NI.c create mode 100644 src/msvc/Hacl_AES_128_NI.c create mode 100644 src/msvc/Hacl_Gf128_NI.c create mode 100644 src/msvc/Hacl_Gf128_PreComp.c create mode 100644 src/msvc/Hacl_Lib.c diff --git a/include/msvc/Hacl_AES_128_BitSlice.h b/include/msvc/Hacl_AES_128_BitSlice.h new file mode 100644 index 00000000..3a146a89 --- /dev/null +++ b/include/msvc/Hacl_AES_128_BitSlice.h @@ -0,0 +1,73 @@ +/* MIT License + * + 
* Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_128_BitSlice_H +#define __Hacl_AES_128_BitSlice_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +typedef uint64_t *Hacl_AES_128_BitSlice_aes_ctx; + +typedef uint8_t *Hacl_AES_128_BitSlice_skey; + +void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); + +void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); + +void +Hacl_AES_128_BitSlice_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_128_BitSlice_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_BitSlice_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_128_GCM_M32.h b/include/msvc/Hacl_AES_128_GCM_M32.h new file mode 100644 index 00000000..29125377 --- /dev/null +++ b/include/msvc/Hacl_AES_128_GCM_M32.h @@ -0,0 +1,76 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_GCM_M32_H +#define __Hacl_AES_128_GCM_M32_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_PreComp.h" +#include "Hacl_AES_128_BitSlice.h" + +extern uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len; + +typedef uint64_t *Hacl_AES_128_GCM_M32_aes_gcm_ctx; + +void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key); + +void +Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_GCM_M32_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_128_GCM_NI.h b/include/msvc/Hacl_AES_128_GCM_NI.h new file mode 100644 index 00000000..ab520316 --- /dev/null +++ b/include/msvc/Hacl_AES_128_GCM_NI.h @@ -0,0 +1,75 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without 
restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_GCM_NI_H +#define __Hacl_AES_128_GCM_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_NI.h" +#include "Hacl_AES_128_NI.h" +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_GCM_NI_aes_gcm_ctx; + +void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key); + +void +Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_GCM_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_128_NI.h b/include/msvc/Hacl_AES_128_NI.h new 
file mode 100644 index 00000000..f3c148b5 --- /dev/null +++ b/include/msvc/Hacl_AES_128_NI.h @@ -0,0 +1,81 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_128_NI_H +#define __Hacl_AES_128_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_NI_aes_ctx; + +typedef uint8_t *Hacl_AES_128_NI_skey; + +void +Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); + +void +Hacl_AES_128_NI_aes128_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +); + +void +Hacl_AES_128_NI_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_128_NI_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_Gf128_NI.h b/include/msvc/Hacl_Gf128_NI.h new file mode 100644 index 00000000..46322a32 --- /dev/null +++ b/include/msvc/Hacl_Gf128_NI.h @@ -0,0 +1,65 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_Gf128_NI_H +#define __Hacl_Gf128_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "libintvector.h" + +void Hacl_Gf128_NI_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key); + +void +Hacl_Gf128_NI_gcm_update_blocks( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *text +); + +extern void +(*Hacl_Gf128_NI_gcm_update_padded)( + Lib_IntVector_Intrinsics_vec128 *x0, + uint32_t x1, + uint8_t *x2 +); + +void Hacl_Gf128_NI_gcm_emit(uint8_t *tag, Lib_IntVector_Intrinsics_vec128 *ctx); + +void Hacl_Gf128_NI_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_Gf128_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_Gf128_PreComp.h b/include/msvc/Hacl_Gf128_PreComp.h new file mode 100644 index 00000000..3d67add1 --- /dev/null +++ b/include/msvc/Hacl_Gf128_PreComp.h @@ -0,0 +1,54 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * 
copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_Gf128_PreComp_H +#define __Hacl_Gf128_PreComp_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key); + +void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text); + +extern void +(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2); + +void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx); + +void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_Gf128_PreComp_H_DEFINED +#endif diff --git a/include/msvc/internal/Hacl_AES_128_BitSlice.h b/include/msvc/internal/Hacl_AES_128_BitSlice.h new file mode 100644 index 00000000..3b95bb9b --- /dev/null +++ b/include/msvc/internal/Hacl_AES_128_BitSlice.h @@ -0,0 +1,83 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated 
documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __internal_Hacl_AES_128_BitSlice_H +#define __internal_Hacl_AES_128_BitSlice_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "internal/Hacl_Lib.h" +#include "../Hacl_AES_128_BitSlice.h" + +void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp); + +void Hacl_Impl_AES_CoreBitSlice_load_key1(uint64_t *out, uint8_t *k); + +void Hacl_Impl_AES_CoreBitSlice_load_nonce(uint64_t *out, uint8_t *nonce1); + +void Hacl_Impl_AES_CoreBitSlice_load_state(uint64_t *out, uint64_t *nonce1, uint32_t counter); + +void Hacl_Impl_AES_CoreBitSlice_xor_state_key1(uint64_t *st, uint64_t *ost); + +void Hacl_Impl_AES_CoreBitSlice_aes_enc(uint64_t *st, uint64_t *key); + +void Hacl_Impl_AES_CoreBitSlice_aes_enc_last(uint64_t *st, uint64_t *key); + +void +Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(uint64_t *next, uint64_t *prev, uint8_t rcon1); + +void 
Hacl_Impl_AES_CoreBitSlice_key_expansion_step(uint64_t *next, uint64_t *prev); + +void +Hacl_Impl_AES_Generic_aes128_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +); + +void +Hacl_Impl_AES_Generic_aes256_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +); + +#if defined(__cplusplus) +} +#endif + +#define __internal_Hacl_AES_128_BitSlice_H_DEFINED +#endif diff --git a/include/msvc/internal/Hacl_Lib.h b/include/msvc/internal/Hacl_Lib.h new file mode 100644 index 00000000..61b523ff --- /dev/null +++ b/include/msvc/internal/Hacl_Lib.h @@ -0,0 +1,69 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __internal_Hacl_Lib_H +#define __internal_Hacl_Lib_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +typedef struct Lib_Transposition64x8_uint64x2_s +{ + uint64_t fst; + uint64_t snd; +} +Lib_Transposition64x8_uint64x2; + +typedef struct Lib_Transposition64x8_uint64x4_s +{ + Lib_Transposition64x8_uint64x2 fst; + Lib_Transposition64x8_uint64x2 snd; +} +Lib_Transposition64x8_uint64x4; + +typedef struct Lib_Transposition64x8_uint64x8_s +{ + Lib_Transposition64x8_uint64x4 fst; + Lib_Transposition64x8_uint64x4 snd; +} +Lib_Transposition64x8_uint64x8; + +uint64_t Lib_Transposition64x8_transpose_bits64(uint64_t x); + +Lib_Transposition64x8_uint64x8 +Lib_Transposition64x8_transpose_bits64x8(Lib_Transposition64x8_uint64x8 a); + +#if defined(__cplusplus) +} +#endif + +#define __internal_Hacl_Lib_H_DEFINED +#endif diff --git a/include/msvc/libintvector.h b/include/msvc/libintvector.h index 99d11336..4c259dba 100644 --- a/include/msvc/libintvector.h +++ b/include/msvc/libintvector.h @@ -121,12 +121,18 @@ typedef __m128i Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \ (_mm_loadu_si128((__m128i*)(x0))) +#define Lib_IntVector_Intrinsics_vec128_load128_le(x0) \ + (_mm_loadu_si128((__m128i*)(x0))) + #define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \ (_mm_storeu_si128((__m128i*)(x0), x1)) #define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \ (_mm_storeu_si128((__m128i*)(x0), x1)) +#define Lib_IntVector_Intrinsics_vec128_store128_le(x0, x1) \ + (_mm_storeu_si128((__m128i*)(x0), x1)) + #define Lib_IntVector_Intrinsics_vec128_load_be(x0) \ (_mm_shuffle_epi8(_mm_loadu_si128((__m128i*)(x0)), _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15))) @@ -456,6 +462,30 @@ typedef __m256i Lib_IntVector_Intrinsics_vec256; typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; +#if 
defined(__ARM_FEATURE_AES) + +#define Lib_IntVector_Intrinsics_ni_aes_enc(x0, x1) \ + ((uint32x4_t)(vaesmcq_u8(vaeseq_u8((uint8x16_t)x0, (uint8x16_t){})) ^ (uint8x16_t)x1)) + +#define Lib_IntVector_Intrinsics_ni_aes_enc_last(x0, x1) \ + ((uint32x4_t)(vaeseq_u8((uint8x16_t)x0, (uint8x16_t){}) ^ (uint8x16_t)x1)) + +static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_ni_aes_keygen_assist (Lib_IntVector_Intrinsics_vec128 x0, uint8_t x1){ + uint8x16_t tmp = vaeseq_u8((uint8x16_t)x0, (uint8x16_t){}); + return (uint32x4_t)((uint8x16_t){ + tmp[4], tmp[1], tmp[14], tmp[11], + tmp[1], tmp[14], tmp[11], tmp[4], + tmp[12], tmp[9], tmp[6], tmp[3], + tmp[9], tmp[6], tmp[3], tmp[12] + } ^ (uint8x16_t)(uint32x4_t){0, x1, 0, x1}); +} + +#define Lib_IntVector_Intrinsics_ni_clmul(x0, x1, x2) \ + ((x2) == 0x11? (uint32x4_t)vmull_high_p64((poly64x2_t)x0, (poly64x2_t)x1) : \ + (uint32x4_t)vmull_p64(vgetq_lane_u64((uint64x2_t)x0,(x2)&1), vgetq_lane_u64((uint64x2_t)x1,(x2)>>4))) + +#endif + #define Lib_IntVector_Intrinsics_vec128_xor(x0, x1) \ (veorq_u32(x0,x1)) @@ -486,12 +516,11 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_lognot(x0) \ (vmvnq_u32(x0)) - #define Lib_IntVector_Intrinsics_vec128_shift_left(x0, x1) \ - (vextq_u32(x0, vdupq_n_u8(0), 16-(x1)/8)) + ((uint32x4_t)vextq_u8(vdupq_n_u8(0), (uint8x16_t)x0, 16-(x1)/8)) #define Lib_IntVector_Intrinsics_vec128_shift_right(x0, x1) \ - (vextq_u32(x0, vdupq_n_u8(0), (x1)/8)) + ((uint32x4_t)vextq_u8((uint8x16_t)x0, vdupq_n_u8(0), (x1)/8)) #define Lib_IntVector_Intrinsics_vec128_shift_left64(x0, x1) \ (vreinterpretq_u32_u64(vshlq_n_u64(vreinterpretq_u64_u32(x0), x1))) @@ -525,11 +554,10 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_rotate_right_lanes64(x0, x1) \ (vextq_u64(x0,x0,x1)) - -/* #define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \ - (_mm_shuffle_epi32(x0, _MM_SHUFFLE(x1,x2,x3,x4))) + 
((uint32x4_t){((uint32x4_t)x0)[x1],((uint32x4_t)x0)[x2],((uint32x4_t)x0)[x3],((uint32x4_t)x0)[x4]}) +/* #define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \ (_mm_shuffle_epi32(x0, _MM_SHUFFLE(2*x1+1,2*x1,2*x2+1,2*x2))) */ @@ -540,17 +568,17 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_load64_le(x0) \ (vld1q_u32((const uint32_t*) (x0))) +#define Lib_IntVector_Intrinsics_vec128_load128_le(x0) \ + ((uint32x4_t)vld1q_u8((uint8_t*)x0)) + #define Lib_IntVector_Intrinsics_vec128_store32_le(x0, x1) \ (vst1q_u32((uint32_t*)(x0),(x1))) #define Lib_IntVector_Intrinsics_vec128_store64_le(x0, x1) \ (vst1q_u32((uint32_t*)(x0),(x1))) -/* -#define Lib_IntVector_Intrinsics_vec128_load_be(x0) \ - ( Lib_IntVector_Intrinsics_vec128 l = vrev64q_u8(vld1q_u32((uint32_t*)(x0))); - -*/ +#define Lib_IntVector_Intrinsics_vec128_store128_le(x0, x1) \ + (vst1q_u8((uint8_t*)x0, (uint8x16_t)x1)) #define Lib_IntVector_Intrinsics_vec128_load32_be(x0) \ (vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0)))))) @@ -558,10 +586,10 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_load64_be(x0) \ (vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(vld1q_u32((const uint32_t*)(x0)))))) -/* -#define Lib_IntVector_Intrinsics_vec128_store_be(x0, x1) \ - (_mm_storeu_si128((__m128i*)(x0), _mm_shuffle_epi8(x1, _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15)))) -*/ +static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_vec128_load_be(uint8_t* x0){ + uint64x2_t l = (uint64x2_t)vrev64q_u8(vld1q_u8(x0)); + return (uint32x4_t)vextq_u64(l, l, 1); +} #define Lib_IntVector_Intrinsics_vec128_store32_be(x0, x1) \ (vst1q_u32((uint32_t*)(x0),(vreinterpretq_u32_u8(vrev32q_u8(vreinterpretq_u8_u32(x1)))))) @@ -569,6 +597,11 @@ typedef uint32x4_t Lib_IntVector_Intrinsics_vec128; #define Lib_IntVector_Intrinsics_vec128_store64_be(x0, x1) \ 
(vst1q_u32((uint32_t*)(x0),(vreinterpretq_u32_u8(vrev64q_u8(vreinterpretq_u8_u32(x1)))))) +static inline void Lib_IntVector_Intrinsics_vec128_store_be(uint8_t* x0, Lib_IntVector_Intrinsics_vec128 x1){ + uint64x2_t l = (uint64x2_t)vrev64q_u8((uint8x16_t)x1); + vst1q_u8(x0, (uint8x16_t)vextq_u64(l, l, 1)); +} + #define Lib_IntVector_Intrinsics_vec128_insert8(x0, x1, x2) \ (vsetq_lane_u8(x1,x0,x2)) diff --git a/src/msvc/EverCrypt_AEAD.c b/src/msvc/EverCrypt_AEAD.c index a4b306b7..6c21c319 100644 --- a/src/msvc/EverCrypt_AEAD.c +++ b/src/msvc/EverCrypt_AEAD.c @@ -26,8 +26,13 @@ #include "EverCrypt_AEAD.h" #include "internal/Vale.h" +#ifdef HACL_CAN_COMPILE_AESNI_PCLMUL +#include "Hacl_AES_128_GCM_NI.h" +#endif +#include "Hacl_AES_128_GCM_M32.h" #include "internal/Hacl_Spec.h" #include "config.h" +#include "hacl-cpu-features.h" /** Both encryption and decryption require a state that holds the key. @@ -63,10 +68,14 @@ Spec_Agile_AEAD_alg EverCrypt_AEAD_alg_of_state(EverCrypt_AEAD_state_s *s) return Spec_Agile_AEAD_CHACHA20_POLY1305; } case Spec_Cipher_Expansion_Vale_AES128: + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + case Spec_Cipher_Expansion_M32_AES128: { return Spec_Agile_AEAD_AES128_GCM; } case Spec_Cipher_Expansion_Vale_AES256: + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: + case Spec_Cipher_Expansion_M32_AES256: { return Spec_Agile_AEAD_AES256_GCM; } @@ -93,12 +102,12 @@ create_in_chacha20_poly1305(EverCrypt_AEAD_state_s **dst, uint8_t *k) static EverCrypt_Error_error_code create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) { + #if HACL_CAN_COMPILE_VALE bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t *ek = (uint8_t 
*)KRML_HOST_CALLOC((uint32_t)480U, sizeof (uint8_t)); @@ -112,8 +121,31 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) *dst = p; return EverCrypt_Error_Success; } + else + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx, k); + EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + Hacl_AES_128_GCM_M32_aes128_gcm_init(aes_gcm_ctx, k); + EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_M32_AES128, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } } static EverCrypt_Error_error_code @@ -306,6 +338,78 @@ encrypt_aes128_gcm( #endif } +static EverCrypt_Error_error_code +encrypt_aes128_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + 
Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +encrypt_aes128_gcm_m32( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; +} + static EverCrypt_Error_error_code encrypt_aes256_gcm( EverCrypt_AEAD_state_s *s, @@ -488,6 +592,14 @@ EverCrypt_AEAD_encrypt( EverCrypt_Chacha20Poly1305_aead_encrypt(ek, iv, ad_len, ad, plain_len, plain, cipher, tag); return EverCrypt_Error_Success; } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } + case Spec_Cipher_Expansion_M32_AES128: + { + return encrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } default: { KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__); @@ -1282,6 +1394,86 @@ decrypt_aes128_gcm( #endif } +static EverCrypt_Error_error_code +decrypt_aes128_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + 
uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +decrypt_aes128_gcm_m32( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_M32_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; +} + static EverCrypt_Error_error_code decrypt_aes256_gcm( EverCrypt_AEAD_state_s *s, @@ -1512,6 +1704,14 @@ EverCrypt_AEAD_decrypt( { return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, 
tag, dst); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } + case Spec_Cipher_Expansion_M32_AES128: + { + return decrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } default: { KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__); diff --git a/src/msvc/Hacl_AES_128_BitSlice.c b/src/msvc/Hacl_AES_128_BitSlice.c new file mode 100644 index 00000000..a0d2938f --- /dev/null +++ b/src/msvc/Hacl_AES_128_BitSlice.c @@ -0,0 +1,1105 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "internal/Hacl_AES_128_BitSlice.h" + +#include "internal/Hacl_Lib.h" + +typedef struct __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_s +{ + uint64_t fst; + uint64_t snd; + uint64_t thd; + uint64_t f3; + uint64_t f4; + uint64_t f5; + uint64_t f6; + uint64_t f7; +} +__uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t; + +static __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t +sub_bytes64x8( + uint64_t st0, + uint64_t st1, + uint64_t st2, + uint64_t st3, + uint64_t st4, + uint64_t st5, + uint64_t st6, + uint64_t st7 +) +{ + uint64_t input[8U] = { 0U }; + input[0U] = st0; + input[1U] = st1; + input[2U] = st2; + input[3U] = st3; + input[4U] = st4; + input[5U] = st5; + input[6U] = st6; + input[7U] = st7; + uint64_t output[8U] = { 0U }; + uint64_t tmp[121U] = { 0U }; + tmp[0U] = input[7U]; + tmp[1U] = input[6U]; + tmp[2U] = input[5U]; + tmp[3U] = input[4U]; + tmp[4U] = input[3U]; + tmp[5U] = input[2U]; + tmp[6U] = input[1U]; + tmp[7U] = input[0U]; + tmp[8U] = tmp[3U] ^ tmp[5U]; + tmp[9U] = tmp[0U] ^ tmp[6U]; + tmp[10U] = tmp[0U] ^ tmp[3U]; + tmp[11U] = tmp[0U] ^ tmp[5U]; + tmp[12U] = tmp[1U] ^ tmp[2U]; + tmp[13U] = tmp[12U] ^ tmp[7U]; + tmp[14U] = tmp[13U] ^ tmp[3U]; + tmp[15U] = tmp[9U] ^ tmp[8U]; + tmp[16U] = tmp[13U] ^ tmp[0U]; + tmp[17U] = tmp[13U] ^ tmp[6U]; + tmp[18U] = tmp[17U] ^ tmp[11U]; + tmp[19U] = tmp[4U] ^ tmp[15U]; + tmp[20U] = tmp[19U] ^ tmp[5U]; + tmp[21U] = tmp[19U] ^ tmp[1U]; + tmp[22U] = tmp[20U] ^ tmp[7U]; + tmp[23U] = tmp[20U] ^ tmp[12U]; + tmp[24U] = tmp[21U] ^ tmp[10U]; + tmp[25U] = tmp[7U] ^ tmp[24U]; + tmp[26U] = tmp[23U] ^ tmp[24U]; + tmp[27U] = tmp[23U] ^ tmp[11U]; + tmp[28U] = tmp[12U] ^ tmp[24U]; + tmp[29U] = tmp[9U] ^ tmp[28U]; + tmp[30U] = tmp[0U] ^ tmp[28U]; + tmp[31U] = tmp[15U] & tmp[20U]; + tmp[32U] = tmp[18U] & tmp[22U]; + tmp[33U] = tmp[32U] ^ tmp[31U]; + tmp[34U] = tmp[14U] & tmp[7U]; + tmp[35U] = tmp[34U] ^ tmp[31U]; + tmp[36U] = tmp[9U] & 
tmp[28U]; + tmp[37U] = tmp[17U] & tmp[13U]; + tmp[38U] = tmp[37U] ^ tmp[36U]; + tmp[39U] = tmp[16U] & tmp[25U]; + tmp[40U] = tmp[39U] ^ tmp[36U]; + tmp[41U] = tmp[10U] & tmp[24U]; + tmp[42U] = tmp[8U] & tmp[26U]; + tmp[43U] = tmp[42U] ^ tmp[41U]; + tmp[44U] = tmp[11U] & tmp[23U]; + tmp[45U] = tmp[44U] ^ tmp[41U]; + tmp[46U] = tmp[33U] ^ tmp[21U]; + tmp[47U] = tmp[35U] ^ tmp[45U]; + tmp[48U] = tmp[38U] ^ tmp[43U]; + tmp[49U] = tmp[40U] ^ tmp[45U]; + tmp[50U] = tmp[46U] ^ tmp[43U]; + tmp[51U] = tmp[47U] ^ tmp[27U]; + tmp[52U] = tmp[48U] ^ tmp[29U]; + tmp[53U] = tmp[49U] ^ tmp[30U]; + tmp[54U] = tmp[50U] ^ tmp[51U]; + tmp[55U] = tmp[50U] & tmp[52U]; + tmp[56U] = tmp[53U] ^ tmp[55U]; + tmp[57U] = tmp[54U] & tmp[56U]; + tmp[58U] = tmp[57U] ^ tmp[51U]; + tmp[59U] = tmp[52U] ^ tmp[53U]; + tmp[60U] = tmp[51U] ^ tmp[55U]; + tmp[61U] = tmp[60U] & tmp[59U]; + tmp[62U] = tmp[61U] ^ tmp[53U]; + tmp[63U] = tmp[52U] ^ tmp[62U]; + tmp[64U] = tmp[56U] ^ tmp[62U]; + tmp[65U] = tmp[53U] & tmp[64U]; + tmp[66U] = tmp[65U] ^ tmp[63U]; + tmp[67U] = tmp[56U] ^ tmp[65U]; + tmp[68U] = tmp[58U] & tmp[67U]; + tmp[69U] = tmp[54U] ^ tmp[68U]; + tmp[70U] = tmp[69U] ^ tmp[66U]; + tmp[71U] = tmp[58U] ^ tmp[62U]; + tmp[72U] = tmp[58U] ^ tmp[69U]; + tmp[73U] = tmp[62U] ^ tmp[66U]; + tmp[74U] = tmp[71U] ^ tmp[70U]; + tmp[75U] = tmp[73U] & tmp[20U]; + tmp[76U] = tmp[66U] & tmp[22U]; + tmp[77U] = tmp[62U] & tmp[7U]; + tmp[78U] = tmp[72U] & tmp[28U]; + tmp[79U] = tmp[69U] & tmp[13U]; + tmp[80U] = tmp[58U] & tmp[25U]; + tmp[81U] = tmp[71U] & tmp[24U]; + tmp[82U] = tmp[74U] & tmp[26U]; + tmp[83U] = tmp[70U] & tmp[23U]; + tmp[84U] = tmp[73U] & tmp[15U]; + tmp[85U] = tmp[66U] & tmp[18U]; + tmp[86U] = tmp[62U] & tmp[14U]; + tmp[87U] = tmp[72U] & tmp[9U]; + tmp[88U] = tmp[69U] & tmp[17U]; + tmp[89U] = tmp[58U] & tmp[16U]; + tmp[90U] = tmp[71U] & tmp[10U]; + tmp[91U] = tmp[74U] & tmp[8U]; + tmp[92U] = tmp[70U] & tmp[11U]; + tmp[93U] = tmp[90U] ^ tmp[91U]; + tmp[94U] = tmp[85U] ^ tmp[93U]; + tmp[95U] = tmp[84U] 
^ tmp[94U]; + tmp[96U] = tmp[75U] ^ tmp[77U]; + tmp[97U] = tmp[76U] ^ tmp[75U]; + tmp[98U] = tmp[78U] ^ tmp[79U]; + tmp[99U] = tmp[87U] ^ tmp[96U]; + tmp[100U] = tmp[82U] ^ tmp[98U]; + tmp[101U] = tmp[83U] ^ tmp[99U]; + tmp[102U] = tmp[100U] ^ tmp[101U]; + tmp[103U] = tmp[98U] ^ tmp[97U]; + tmp[104U] = tmp[78U] ^ tmp[80U]; + tmp[105U] = tmp[88U] ^ tmp[93U]; + tmp[106U] = tmp[96U] ^ tmp[104U]; + tmp[107U] = tmp[95U] ^ tmp[103U]; + tmp[108U] = tmp[81U] ^ tmp[100U]; + tmp[109U] = tmp[89U] ^ tmp[102U]; + tmp[110U] = tmp[105U] ^ tmp[106U]; + uint64_t uu____0 = tmp[87U]; + uint64_t uu____1 = tmp[110U]; + tmp[111U] = (~uu____0 & ~uu____1) | (uu____0 & uu____1); + tmp[112U] = tmp[90U] ^ tmp[108U]; + tmp[113U] = tmp[94U] ^ tmp[86U]; + tmp[114U] = tmp[95U] ^ tmp[108U]; + uint64_t uu____2 = tmp[102U]; + uint64_t uu____3 = tmp[110U]; + tmp[115U] = (~uu____2 & ~uu____3) | (uu____2 & uu____3); + tmp[116U] = tmp[106U] ^ tmp[107U]; + uint64_t uu____4 = tmp[107U]; + uint64_t uu____5 = tmp[108U]; + tmp[117U] = (~uu____4 & ~uu____5) | (uu____4 & uu____5); + tmp[118U] = tmp[109U] ^ tmp[112U]; + uint64_t uu____6 = tmp[118U]; + uint64_t uu____7 = tmp[92U]; + tmp[119U] = (~uu____6 & ~uu____7) | (uu____6 & uu____7); + tmp[120U] = tmp[113U] ^ tmp[109U]; + uint64_t o = tmp[111U]; + output[0U] = o; + uint64_t o0 = tmp[115U]; + output[1U] = o0; + uint64_t o8 = tmp[120U]; + output[2U] = o8; + uint64_t o9 = tmp[116U]; + output[3U] = o9; + uint64_t o10 = tmp[107U]; + output[4U] = o10; + uint64_t o11 = tmp[119U]; + output[5U] = o11; + uint64_t o12 = tmp[117U]; + output[6U] = o12; + uint64_t o13 = tmp[114U]; + output[7U] = o13; + uint64_t o00 = output[0U]; + uint64_t o1 = output[1U]; + uint64_t o2 = output[2U]; + uint64_t o3 = output[3U]; + uint64_t o4 = output[4U]; + uint64_t o5 = output[5U]; + uint64_t o6 = output[6U]; + uint64_t o7 = output[7U]; + return + ( + (__uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t){ + .fst = o00, + .snd = o1, + .thd = o2, + .f3 = o3, + .f4 = 
o4, + .f5 = o5, + .f6 = o6, + .f7 = o7 + } + ); +} + +static void load_block0(uint64_t *out, uint8_t *inp) +{ + uint8_t *b1 = inp; + uint8_t *b2 = inp + (uint32_t)8U; + uint64_t u0 = load64_le(b1); + uint64_t fst = u0; + uint64_t u1 = load64_le(b2); + uint64_t snd = u1; + uint64_t fst1 = Lib_Transposition64x8_transpose_bits64(fst); + uint64_t snd1 = Lib_Transposition64x8_transpose_bits64(snd); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint32_t sh = i * (uint32_t)8U; + uint64_t u = fst1 >> sh & (uint64_t)0xffU; + uint64_t u10 = u ^ (snd1 >> sh & (uint64_t)0xffU) << (uint32_t)8U; + out[i] = u10;); +} + +static void transpose_state(uint64_t *st) +{ + uint64_t i0 = st[0U]; + uint64_t i1 = st[1U]; + uint64_t i2 = st[2U]; + uint64_t i3 = st[3U]; + uint64_t i4 = st[4U]; + uint64_t i5 = st[5U]; + uint64_t i6 = st[6U]; + uint64_t i7 = st[7U]; + Lib_Transposition64x8_uint64x8 + scrut = + Lib_Transposition64x8_transpose_bits64x8(( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = i0, .snd = i1 }, .snd = { .fst = i2, .snd = i3 } }, + .snd = { .fst = { .fst = i4, .snd = i5 }, .snd = { .fst = i6, .snd = i7 } } + } + )); + uint64_t t7 = scrut.snd.snd.snd; + uint64_t t6 = scrut.snd.snd.fst; + uint64_t t5 = scrut.snd.fst.snd; + uint64_t t4 = scrut.snd.fst.fst; + uint64_t t3 = scrut.fst.snd.snd; + uint64_t t2 = scrut.fst.snd.fst; + uint64_t t1 = scrut.fst.fst.snd; + uint64_t t0 = scrut.fst.fst.fst; + st[0U] = t0; + st[1U] = t1; + st[2U] = t2; + st[3U] = t3; + st[4U] = t4; + st[5U] = t5; + st[6U] = t6; + st[7U] = t7; +} + +void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp) +{ + uint64_t i0 = inp[0U]; + uint64_t i1 = inp[1U]; + uint64_t i2 = inp[2U]; + uint64_t i3 = inp[3U]; + uint64_t i4 = inp[4U]; + uint64_t i5 = inp[5U]; + uint64_t i6 = inp[6U]; + uint64_t i7 = inp[7U]; + Lib_Transposition64x8_uint64x8 + scrut = + Lib_Transposition64x8_transpose_bits64x8(( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = 
i0, .snd = i1 }, .snd = { .fst = i2, .snd = i3 } }, + .snd = { .fst = { .fst = i4, .snd = i5 }, .snd = { .fst = i6, .snd = i7 } } + } + )); + uint64_t t1 = scrut.fst.fst.snd; + uint64_t t0 = scrut.fst.fst.fst; + store64_le(out, t0); + store64_le(out + (uint32_t)8U, t1); +} + +void Hacl_Impl_AES_CoreBitSlice_load_key1(uint64_t *out, uint8_t *k) +{ + load_block0(out, k); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u = out[i]; + uint64_t u1 = u ^ u << (uint32_t)16U; + uint64_t u2 = u1 ^ u1 << (uint32_t)32U; + out[i] = u2;); +} + +void Hacl_Impl_AES_CoreBitSlice_load_nonce(uint64_t *out, uint8_t *nonce1) +{ + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce1, (uint32_t)12U * sizeof (uint8_t)); + Hacl_Impl_AES_CoreBitSlice_load_key1(out, nb); +} + +void Hacl_Impl_AES_CoreBitSlice_load_state(uint64_t *out, uint64_t *nonce1, uint32_t counter) +{ + uint8_t ctr[16U] = { 0U }; + store32_be(ctr, counter); + store32_be(ctr + (uint32_t)4U, counter + (uint32_t)1U); + store32_be(ctr + (uint32_t)8U, counter + (uint32_t)2U); + store32_be(ctr + (uint32_t)12U, counter + (uint32_t)3U); + load_block0(out, ctr); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u = out[i]; + uint64_t + u1 = ((u << (uint32_t)12U | u << (uint32_t)24U) | u << (uint32_t)36U) | u << (uint32_t)48U; + uint64_t u2 = u1 & (uint64_t)0xf000f000f000f000U; + out[i] = u2 ^ nonce1[i];); +} + +void Hacl_Impl_AES_CoreBitSlice_xor_state_key1(uint64_t *st, uint64_t *ost) +{ + KRML_MAYBE_FOR8(i, (uint32_t)0U, (uint32_t)8U, (uint32_t)1U, st[i] = st[i] ^ ost[i];); +} + +static void xor_block(uint8_t *out, uint64_t *st, uint8_t *inp) +{ + transpose_state(st); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint8_t *ob = out + i * (uint32_t)8U; + uint8_t *ib = inp + i * (uint32_t)8U; + uint64_t u = load64_le(ib); + uint64_t u0 = u; + store64_le(ob, u0 ^ st[i]);); +} + +static void sub_bytes_state(uint64_t *st) +{ + uint64_t st0 = st[0U]; + 
uint64_t st1 = st[1U]; + uint64_t st2 = st[2U]; + uint64_t st3 = st[3U]; + uint64_t st4 = st[4U]; + uint64_t st5 = st[5U]; + uint64_t st6 = st[6U]; + uint64_t st7 = st[7U]; + __uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t_uint64_t + scrut = sub_bytes64x8(st0, st1, st2, st3, st4, st5, st6, st7); + uint64_t st01 = scrut.fst; + uint64_t st11 = scrut.snd; + uint64_t st21 = scrut.thd; + uint64_t st31 = scrut.f3; + uint64_t st41 = scrut.f4; + uint64_t st51 = scrut.f5; + uint64_t st61 = scrut.f6; + uint64_t st71 = scrut.f7; + st[0U] = st01; + st[1U] = st11; + st[2U] = st21; + st[3U] = st31; + st[4U] = st41; + st[5U] = st51; + st[6U] = st61; + st[7U] = st71; +} + +static void shift_rows_state(uint64_t *st) +{ + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t rowi = st[i]; + st[i] = + ((((((rowi & (uint64_t)0x1111111111111111U) + | (rowi & (uint64_t)0x2220222022202220U) >> (uint32_t)4U) + | (rowi & (uint64_t)0x0002000200020002U) << (uint32_t)12U) + | (rowi & (uint64_t)0x4400440044004400U) >> (uint32_t)8U) + | (rowi & (uint64_t)0x0044004400440044U) << (uint32_t)8U) + | (rowi & (uint64_t)0x8000800080008000U) >> (uint32_t)12U) + | (rowi & (uint64_t)0x0888088808880888U) << (uint32_t)4U;); +} + +static void mix_columns_state(uint64_t *st) +{ + uint64_t col[8U] = { 0U }; + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t coli = st[i]; + col[i] = + coli + ^ + ((coli & (uint64_t)0xeeeeeeeeeeeeeeeeU) + >> (uint32_t)1U + | (coli & (uint64_t)0x1111111111111111U) << (uint32_t)3U);); + uint64_t col0 = col[0U]; + uint64_t + ncol0 = + col0 + ^ + ((col0 & (uint64_t)0xccccccccccccccccU) + >> (uint32_t)2U + | (col0 & (uint64_t)0x3333333333333333U) << (uint32_t)2U); + st[0U] = st[0U] ^ ncol0; + KRML_MAYBE_FOR7(i, + (uint32_t)0U, + (uint32_t)7U, + (uint32_t)1U, + uint64_t prev = col[i]; + uint64_t next = col[i + (uint32_t)1U]; + uint64_t + ncoli = + next + ^ + ((next & (uint64_t)0xccccccccccccccccU) + >> (uint32_t)2U 
+ | (next & (uint64_t)0x3333333333333333U) << (uint32_t)2U); + st[i + (uint32_t)1U] = st[i + (uint32_t)1U] ^ (ncoli ^ prev);); + st[0U] = st[0U] ^ col[7U]; + st[1U] = st[1U] ^ col[7U]; + st[3U] = st[3U] ^ col[7U]; + st[4U] = st[4U] ^ col[7U]; +} + +void Hacl_Impl_AES_CoreBitSlice_aes_enc(uint64_t *st, uint64_t *key) +{ + sub_bytes_state(st); + shift_rows_state(st); + mix_columns_state(st); + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, key); +} + +void Hacl_Impl_AES_CoreBitSlice_aes_enc_last(uint64_t *st, uint64_t *key) +{ + sub_bytes_state(st); + shift_rows_state(st); + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, key); +} + +void +Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(uint64_t *next, uint64_t *prev, uint8_t rcon1) +{ + memcpy(next, prev, (uint32_t)8U * sizeof (uint64_t)); + sub_bytes_state(next); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t u3 = next[i] & (uint64_t)0xf000f000f000f000U; + uint64_t n = u3 >> (uint32_t)12U; + uint64_t n1 = (n >> (uint32_t)1U | n << (uint32_t)3U) & (uint64_t)0x000f000f000f000fU; + uint64_t ri = (uint64_t)(rcon1 >> i & (uint8_t)1U); + uint64_t ri1 = ri ^ ri << (uint32_t)16U; + uint64_t ri2 = ri1 ^ ri1 << (uint32_t)32U; + uint64_t n2 = n1 ^ ri2; + uint64_t n3 = n2 << (uint32_t)12U; + next[i] = n3 ^ u3 >> (uint32_t)4U;); +} + +void Hacl_Impl_AES_CoreBitSlice_key_expansion_step(uint64_t *next, uint64_t *prev) +{ + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t p = prev[i]; + uint64_t n = next[i]; + uint64_t + p1 = + p + ^ + ((p & (uint64_t)0x0fff0fff0fff0fffU) + << (uint32_t)4U + ^ + ((p & (uint64_t)0x00ff00ff00ff00ffU) + << (uint32_t)8U + ^ (p & (uint64_t)0x000f000f000f000fU) << (uint32_t)12U)); + next[i] = n ^ p1;); +} + +void +Hacl_Impl_AES_Generic_aes128_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + 
uint32_t ctr = counter + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i0 * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(ob, st, ib); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = counter + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(last, st, last); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +void +Hacl_Impl_AES_Generic_aes256_ctr_bitslice( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t counter +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = counter + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + uint64_t st[8U] = { 0U 
}; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i0 * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(ob, st, ib); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = counter + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + uint64_t st[8U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, ctr); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + xor_block(last, st, last); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, key); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> 
(uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next1[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next1[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next2[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next2[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next3[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next3[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next4[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next4[i] = n4;); + 
Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next5[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next5[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next6[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next6[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, prev6); + uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next7[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next7[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next8[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next8[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, prev8); + 
uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next9[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next9[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) +{ + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint64_t st[8U] = { 0U }; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, counter); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)10U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + Hacl_Impl_AES_CoreBitSlice_store_block0(kb, st); +} + +inline void +Hacl_AES_128_BitSlice_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[96U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, k); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 
= n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next1[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next1[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next2[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next2[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next3[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next3[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next4[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next4[i] = 
n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next5[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next5[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next6[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next6[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, prev6); + uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next7[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next7[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next8[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next8[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, 
prev8); + uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next9[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next9[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_128_BitSlice_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[96U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_load_key1(kex, k); + uint64_t *prev = kex; + uint64_t *next = kex + klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next, prev, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next, prev); + uint64_t *prev1 = kex + klen; + uint64_t *next1 = kex + (uint32_t)2U * klen; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next1, prev1, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next1[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next1[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next1, prev1); + uint64_t *prev2 = kex + klen * (uint32_t)2U; + uint64_t *next2 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next2, prev2, 
(uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next2[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next2[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next2, prev2); + uint64_t *prev3 = kex + klen * (uint32_t)3U; + uint64_t *next3 = kex + klen * (uint32_t)4U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next3, prev3, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next3[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next3[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next3, prev3); + uint64_t *prev4 = kex + klen * (uint32_t)4U; + uint64_t *next4 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next4, prev4, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next4[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next4[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next4, prev4); + uint64_t *prev5 = kex + klen * (uint32_t)5U; + uint64_t *next5 = kex + klen * (uint32_t)6U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next5, prev5, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next5[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next5[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next5, prev5); + uint64_t *prev6 = kex + klen * (uint32_t)6U; + uint64_t *next6 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next6, prev6, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + 
(uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next6[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next6[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next6, prev6); + uint64_t *prev7 = kex + klen * (uint32_t)7U; + uint64_t *next7 = kex + klen * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next7, prev7, (uint8_t)0x80U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next7[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next7[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next7, prev7); + uint64_t *prev8 = kex + klen * (uint32_t)8U; + uint64_t *next8 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next8, prev8, (uint8_t)0x1bU); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next8[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next8[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next8, prev8); + uint64_t *prev9 = kex + klen * (uint32_t)9U; + uint64_t *next9 = kex + klen * (uint32_t)10U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next9, prev9, (uint8_t)0x36U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next9[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next9[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next9, prev9); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, inp, ctx, c); +} + diff --git a/src/msvc/Hacl_AES_128_GCM_M32.c b/src/msvc/Hacl_AES_128_GCM_M32.c new file mode 100644 index 00000000..bd172a0e --- /dev/null +++ 
b/src/msvc/Hacl_AES_128_GCM_M32.c @@ -0,0 +1,208 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+
+
+#include "Hacl_AES_128_GCM_M32.h"
+
+#include "internal/Hacl_AES_128_BitSlice.h"
+
+/*
+ * Number of uint64_t words in a context. Layout as used below:
+ *   ctx[0 .. 127]    bit-sliced AES state (nonce + round keys)
+ *   ctx[128 .. 393]  GHASH state; words 128/129 are the accumulator,
+ *                    words 136/137 are read back as the hash key
+ *   ctx[394 .. 395]  per-message tag mask (encrypted initial counter block)
+ */
+uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len = (uint32_t)396U;
+
+/**
+ * Initialise `ctx` for `key`: expand the AES key with an all-zero nonce,
+ * encrypt counter block 0 to obtain the hash key, and set up the GHASH
+ * state with it.
+ */
+void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key)
+{
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t nonce0[12U] = { 0U };
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  Hacl_AES_128_BitSlice_aes128_init(aes_ctx, key, nonce0);
+  Hacl_AES_128_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U);
+  Hacl_Gf128_PreComp_gcm_init(gcm_ctx, gcm_key);
+}
+
+/**
+ * Per-message setup shared by encrypt and decrypt.
+ *
+ * Installs the nonce derived from `iv`, stores the tag mask for this message
+ * in ctx[394..395], and returns the counter value for the first payload
+ * block.
+ *
+ * A 12-byte IV is used directly with initial counter 1. Any other length is
+ * hashed with GHASH together with its 64-bit bit length; the last four bytes
+ * of that hash (big-endian) give the initial counter.
+ */
+static uint32_t aes128_gcm_m32_start(uint64_t *ctx, uint32_t iv_len, uint8_t *iv)
+{
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  uint8_t tag_mix[16U] = { 0U };
+  uint32_t ctr0;
+  if (iv_len == (uint32_t)12U)
+  {
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx, iv);
+    ctr0 = (uint32_t)1U;
+  }
+  else
+  {
+    uint8_t gcm_key[16U] = { 0U };
+    uint8_t tag_iv[16U] = { 0U };
+    uint8_t size_iv[16U] = { 0U };
+    /* Recover the 16-byte hash key from the GHASH state. */
+    store64_be(gcm_key + (uint32_t)8U, gcm_ctx[8U]);
+    store64_be(gcm_key, gcm_ctx[9U]);
+    Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key);
+    /* Widen before multiplying so the bit length cannot wrap at 32 bits. */
+    store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U);
+    KRML_MAYBE_FOR16(i,
+      (uint32_t)0U,
+      (uint32_t)16U,
+      (uint32_t)1U,
+      size_iv[i] = tag_iv[i] ^ size_iv[i];);
+    Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+    Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx, tag_iv);
+    ctr0 = load32_be(tag_iv + (uint32_t)12U);
+  }
+  Hacl_AES_128_BitSlice_aes128_key_block(tag_mix, aes_ctx, ctr0);
+  ctx[394U] = load64_le(tag_mix);
+  ctx[395U] = load64_le(tag_mix + (uint32_t)8U);
+  return ctr0 + (uint32_t)1U;
+}
+
+/**
+ * AES-128-GCM encryption (bit-sliced AES, table-based GHASH).
+ *
+ * ctx      context prepared by Hacl_AES_128_GCM_M32_aes128_gcm_init
+ * len      plaintext length in bytes
+ * out      receives `len` ciphertext bytes followed by the 16-byte tag,
+ *          so it must have room for len + 16 bytes
+ * text     plaintext
+ * aad_len  / aad   additional authenticated data
+ * iv_len   / iv    initialisation vector
+ *
+ * The GHASH accumulator is cleared before returning so the context can be
+ * used for the next message.
+ */
+void
+Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(
+  uint64_t *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *text,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  uint8_t tmp[16U] = { 0U };
+  uint8_t *cip = out;
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  uint64_t *tag_mix = ctx + (uint32_t)394U;
+  uint32_t ctr = aes128_gcm_m32_start(ctx, iv_len, iv);
+  Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, cip);
+  /* Final GHASH block: 64-bit bit lengths of the AAD and the ciphertext.
+     The cast happens before the multiplication so lengths of 2^29 bytes
+     and above do not wrap. */
+  store64_be(tmp, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(tmp + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp);
+  Hacl_Gf128_PreComp_gcm_emit(tmp, gcm_ctx);
+  uint64_t hash_lo = load64_le(tmp);
+  uint64_t hash_hi = load64_le(tmp + (uint32_t)8U);
+  uint64_t tag_lo = hash_lo ^ tag_mix[0U];
+  uint64_t tag_hi = hash_hi ^ tag_mix[1U];
+  store64_le(out + len, tag_lo);
+  store64_le(out + len + (uint32_t)8U, tag_hi);
+  gcm_ctx[0U] = (uint64_t)0U;
+  gcm_ctx[1U] = (uint64_t)0U;
+}
+
+/**
+ * AES-128-GCM decryption (bit-sliced AES, table-based GHASH).
+ *
+ * ctx      context prepared by Hacl_AES_128_GCM_M32_aes128_gcm_init
+ * len      ciphertext length in bytes, not counting the tag
+ * out      receives `len` plaintext bytes, written only if the tag matches
+ * cipher   `len` ciphertext bytes immediately followed by the 16-byte tag
+ * aad_len  / aad   additional authenticated data
+ * iv_len   / iv    initialisation vector
+ *
+ * Returns true when the tag verifies and `out` has been filled, false
+ * otherwise. The tag comparison ORs together all byte differences instead
+ * of stopping at the first mismatch.
+ */
+bool
+Hacl_AES_128_GCM_M32_aes128_gcm_decrypt(
+  uint64_t *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *cipher,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  uint8_t text[16U] = { 0U };
+  uint8_t diff = (uint8_t)0U;
+  uint8_t *ciphertext = cipher;
+  uint8_t *tag = cipher + len;
+  uint64_t *aes_ctx = ctx;
+  uint64_t *gcm_ctx = ctx + (uint32_t)128U;
+  uint64_t *tag_mix = ctx + (uint32_t)394U;
+  uint32_t ctr = aes128_gcm_m32_start(ctx, iv_len, iv);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, ciphertext);
+  /* Widen before multiplying, as in encrypt. */
+  store64_be(text, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(text + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text);
+  Hacl_Gf128_PreComp_gcm_emit(text, gcm_ctx);
+  /* Clear the accumulator exactly as encrypt does; otherwise the next
+     message processed with this context would start from this message's
+     hash. NOTE(review): assumes gcm_emit does not already reset it. */
+  gcm_ctx[0U] = (uint64_t)0U;
+  gcm_ctx[1U] = (uint64_t)0U;
+  uint64_t hash_lo = load64_le(text);
+  uint64_t hash_hi = load64_le(text + (uint32_t)8U);
+  uint64_t tag_lo = hash_lo ^ tag_mix[0U];
+  uint64_t tag_hi = hash_hi ^ tag_mix[1U];
+  store64_le(text, tag_lo);
+  store64_le(text + (uint32_t)8U, tag_hi);
+  KRML_MAYBE_FOR16(i,
+    (uint32_t)0U,
+    (uint32_t)16U,
+    (uint32_t)1U,
+    diff = (uint8_t)(diff | (text[i] ^ tag[i])););
+  if (diff == (uint8_t)0U)
+  {
+    Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr);
+    return true;
+  }
+  return false;
+}
+
diff --git a/src/msvc/Hacl_AES_128_GCM_NI.c b/src/msvc/Hacl_AES_128_GCM_NI.c
new file mode 100644
index
00000000..16e03251 --- /dev/null +++ b/src/msvc/Hacl_AES_128_GCM_NI.c @@ -0,0 +1,409 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "Hacl_AES_128_GCM_NI.h" + +void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key) +{ + uint8_t gcm_key[16U] = { 0U }; + uint8_t nonce0[12U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Hacl_AES_128_NI_aes128_init(aes_ctx, key, nonce0); + Hacl_AES_128_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key); +} + +void +Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint32_t ctr; + uint8_t tag_mix0[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); + ctr = (uint32_t)2U; + } + else + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); + Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, tag_iv); + uint32_t u = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u; + Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + ctr = ctr0 + (uint32_t)1U; + } + uint8_t *cip = out; + 
Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr1 = ctr + i * (uint32_t)4U; + uint8_t *ib = text + i * (uint32_t)64U; + uint8_t *ob = cip + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; + uint32_t counter = ctr1; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = 
Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr1 = ctr + blocks64 * (uint32_t)4U; + uint8_t *ib = text + blocks64 * (uint32_t)64U; + uint8_t *ob = cip + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; + uint32_t counter = ctr1; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = 
htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + 
Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip); + uint8_t tmp[16U] = { 0U }; + store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_NI_gcm_emit(tmp, gcm_ctx); + Lib_IntVector_Intrinsics_vec128 tmp_vec = Lib_IntVector_Intrinsics_vec128_load128_le(tmp); + Lib_IntVector_Intrinsics_vec128 + tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix); + Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1); + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; +} + +bool +Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t scratch[18U] = { 0U }; + uint8_t *text = scratch; + uint8_t *result = scratch + (uint32_t)17U; + uint8_t *ciphertext = cipher; + uint8_t *tag = cipher + len; + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + 
Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + uint32_t ctr; + uint8_t tag_mix10[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; + Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, iv); + Hacl_AES_128_NI_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix10); + ctr = (uint32_t)2U; + } + else + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx1 = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx1[4U]); + Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, tag_iv); + uint32_t u = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u; + Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + ctr = ctr0 + (uint32_t)1U; + } + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext); + store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_NI_gcm_emit(text, gcm_ctx); + Lib_IntVector_Intrinsics_vec128 text_vec = Lib_IntVector_Intrinsics_vec128_load128_le(text); + Lib_IntVector_Intrinsics_vec128 + text_vec1 = Lib_IntVector_Intrinsics_vec128_xor(text_vec, tag_mix); + Lib_IntVector_Intrinsics_vec128_store128_le(text, text_vec1); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + 
(uint32_t)1U, + result[0U] = result[0U] | (text[i] ^ tag[i]);); + uint8_t res8 = result[0U]; + if (res8 == (uint8_t)0U) + { + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr1 = ctr + i * (uint32_t)4U; + uint8_t *ib = ciphertext + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; + uint32_t counter = ctr1; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = 
Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr1 = ctr + blocks64 * (uint32_t)4U; + uint8_t *ib = ciphertext + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; + uint32_t counter = ctr1; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t 
counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = 
Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } + return true; + } + return false; +} + diff --git a/src/msvc/Hacl_AES_128_NI.c b/src/msvc/Hacl_AES_128_NI.c new file mode 100644 index 00000000..4a9d9ca8 --- /dev/null +++ b/src/msvc/Hacl_AES_128_NI.c @@ -0,0 +1,1084 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_128_NI.h" + +void +Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *uu____0 = kex; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key); + Lib_IntVector_Intrinsics_vec128 *prev = kex; + Lib_IntVector_Intrinsics_vec128 *next = kex + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key1 = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key4 = + Lib_IntVector_Intrinsics_vec128_xor(key3, + Lib_IntVector_Intrinsics_vec128_shift_left(key3, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key4); + Lib_IntVector_Intrinsics_vec128 *prev1 = kex + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v1, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 
key10 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key40 = + Lib_IntVector_Intrinsics_vec128_xor(key30, + Lib_IntVector_Intrinsics_vec128_shift_left(key30, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key40); + Lib_IntVector_Intrinsics_vec128 *prev2 = kex + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v2, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key11 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key41 = + Lib_IntVector_Intrinsics_vec128_xor(key31, + Lib_IntVector_Intrinsics_vec128_shift_left(key31, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key41); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v3, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key12 = prev3[0U]; + 
Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key42 = + Lib_IntVector_Intrinsics_vec128_xor(key32, + Lib_IntVector_Intrinsics_vec128_shift_left(key32, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key42); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key13 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key43 = + Lib_IntVector_Intrinsics_vec128_xor(key33, + Lib_IntVector_Intrinsics_vec128_shift_left(key33, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key43); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key14 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 + key24 = + 
Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key44 = + Lib_IntVector_Intrinsics_vec128_xor(key34, + Lib_IntVector_Intrinsics_vec128_shift_left(key34, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key44); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key15 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key45 = + Lib_IntVector_Intrinsics_vec128_xor(key35, + Lib_IntVector_Intrinsics_vec128_shift_left(key35, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key45); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key16 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, 
+ Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key46 = + Lib_IntVector_Intrinsics_vec128_xor(key36, + Lib_IntVector_Intrinsics_vec128_shift_left(key36, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key46); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key47 = + Lib_IntVector_Intrinsics_vec128_xor(key37, + Lib_IntVector_Intrinsics_vec128_shift_left(key37, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key47); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key18 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key48 = + Lib_IntVector_Intrinsics_vec128_xor(key38, + Lib_IntVector_Intrinsics_vec128_shift_left(key38, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key48); + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_128_NI_aes128_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + uint32_t counter1 = counter; + uint32_t counter0 = htobe32(counter1); + uint32_t counter11 = htobe32(counter1 + (uint32_t)1U); + uint32_t counter2 = htobe32(counter1 + (uint32_t)2U); + uint32_t counter3 = htobe32(counter1 + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter11, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn 
= kex + (uint32_t)10U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128_store128_le(kb, st[0U]); +} + +inline void +Hacl_AES_128_NI_aes128_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[12U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *uu____0 = kex0; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + Lib_IntVector_Intrinsics_vec128 *prev = kex0; + Lib_IntVector_Intrinsics_vec128 *next = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + 
Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key3); + Lib_IntVector_Intrinsics_vec128 *prev1 = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev2 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key31); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key5 = prev3[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key33); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key7 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key35); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key9 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key37); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key38); + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + 
uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + 
v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + 
Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + 
Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +/* AES-128 in counter mode using the AES-NI wrappers. Expands the 16-byte key k into 11 round keys held in ctx[1..11], loads the first 12 bytes of n into ctx[0], then XORs len bytes of inp with the keystream into out. Block j of the stream uses the 32-bit counter c + j, stored byte-swapped (htobe32) in lane 3 of the nonce vector. Only the forward cipher rounds (ni_aes_enc / ni_aes_enc_last) are used. */ inline void +Hacl_AES_128_NI_aes128_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[12U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + /* Key schedule: round key 0 is k itself; each of the ten stanzas below builds round key r+1 from round key r using keygen_assist with the constants 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x1b, 0x36 in turn. */ Lib_IntVector_Intrinsics_vec128 *uu____0 = kex0; + uu____0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + Lib_IntVector_Intrinsics_vec128 *prev = kex0; + Lib_IntVector_Intrinsics_vec128 *next = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev[0U], (uint8_t)0x01U); + next[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next[0U] = Lib_IntVector_Intrinsics_vec128_xor(next[0U], key3); + Lib_IntVector_Intrinsics_vec128 *prev1 = kex0 + klen; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + (uint32_t)2U * klen; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x02U); + next1[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + 
Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next1[0U] = Lib_IntVector_Intrinsics_vec128_xor(next1[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev2 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next2 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev2[0U], (uint8_t)0x04U); + next2[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev2[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next2[0U] = Lib_IntVector_Intrinsics_vec128_xor(next2[0U], key31); + Lib_IntVector_Intrinsics_vec128 *prev3 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 *next3 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev3[0U], (uint8_t)0x08U); + next3[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key5 = 
prev3[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next3[0U] = Lib_IntVector_Intrinsics_vec128_xor(next3[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev4 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next4 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev4[0U], (uint8_t)0x10U); + next4[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev4[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next4[0U] = Lib_IntVector_Intrinsics_vec128_xor(next4[0U], key33); + Lib_IntVector_Intrinsics_vec128 *prev5 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 *next5 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev5[0U], (uint8_t)0x20U); + next5[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key7 = prev5[0U]; + Lib_IntVector_Intrinsics_vec128 
+ key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next5[0U] = Lib_IntVector_Intrinsics_vec128_xor(next5[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev6 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next6 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev6[0U], (uint8_t)0x40U); + next6[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev6[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next6[0U] = Lib_IntVector_Intrinsics_vec128_xor(next6[0U], key35); + Lib_IntVector_Intrinsics_vec128 *prev7 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 *next7 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev7[0U], (uint8_t)0x80U); + next7[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key9 = prev7[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + 
Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next7[0U] = Lib_IntVector_Intrinsics_vec128_xor(next7[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev8 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next8 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev8[0U], (uint8_t)0x1bU); + next8[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev8[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next8[0U] = Lib_IntVector_Intrinsics_vec128_xor(next8[0U], key37); + Lib_IntVector_Intrinsics_vec128 *prev9 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 *next9 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev9[0U], (uint8_t)0x36U); + next9[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev9[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + 
Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next9[0U] = Lib_IntVector_Intrinsics_vec128_xor(next9[0U], key38); + /* Copy the 12-byte nonce into a zeroed 16-byte buffer and keep it in ctx[0]; lane 3 is replaced with the per-block counter below. */ uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); + uint32_t blocks64 = len / (uint32_t)64U; + /* Full 64-byte chunks: four counter blocks (ctr .. ctr+3) are run through the cipher rounds side by side and XORed with the input. */ for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], 
k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i0, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + /* Tail of fewer than 64 bytes: staged through the zero-initialised 64-byte buffer last, processed like a full chunk, and only rem bytes are copied to out. */ uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * 
(uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter = ctr; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)10U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR9(i, + (uint32_t)0U, + (uint32_t)9U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = 
Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + diff --git a/src/msvc/Hacl_Gf128_NI.c b/src/msvc/Hacl_Gf128_NI.c new file mode 100644 index 00000000..3747dd87 --- /dev/null +++ b/src/msvc/Hacl_Gf128_NI.c @@ -0,0 +1,359 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * 
+ * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_Gf128_NI.h" + +/* x[0] ^= y[0]: XOR of two 128-bit vectors, used by the callers below as the field addition. */ static inline void +fadd0(Lib_IntVector_Intrinsics_vec128 *x, Lib_IntVector_Intrinsics_vec128 *y) +{ + x[0U] = Lib_IntVector_Intrinsics_vec128_xor(x[0U], y[0U]); +} + +/* Multiplies x[0] by y[0] and writes the 128-bit result back to x[0]. Four ni_clmul calls (selectors 0x00, 0x10, 0x01, 0x11) give lo0, m1, m2 and hi; m1 ^ m2 is split across the two halves to form a 256-bit product hi0:lo. That product is shifted left by one bit (the bit leaving lo is carried into hi) and then folded down to 128 bits using the shift amounts 63/62/57 and 1/2/7. NOTE(review): those amounts look like the usual GHASH polynomial reduction - confirm against the spec. */ static inline void +fmul0(Lib_IntVector_Intrinsics_vec128 *x, Lib_IntVector_Intrinsics_vec128 *y) +{ + Lib_IntVector_Intrinsics_vec128 xe = x[0U]; + Lib_IntVector_Intrinsics_vec128 ye = y[0U]; + Lib_IntVector_Intrinsics_vec128 + lo0 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 m1 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m2 = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 hi = Lib_IntVector_Intrinsics_ni_clmul(xe, ye, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 m11 = Lib_IntVector_Intrinsics_vec128_xor(m1, m2); + Lib_IntVector_Intrinsics_vec128 + m21 = Lib_IntVector_Intrinsics_vec128_shift_left(m11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + m12 = Lib_IntVector_Intrinsics_vec128_shift_right(m11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo10 = Lib_IntVector_Intrinsics_vec128_xor(lo0, m21); + Lib_IntVector_Intrinsics_vec128 hi10 = Lib_IntVector_Intrinsics_vec128_xor(hi, m12); + Lib_IntVector_Intrinsics_vec128 hi0 = hi10; + Lib_IntVector_Intrinsics_vec128 lo = lo10; + 
Lib_IntVector_Intrinsics_vec128 + lo1 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo2 = Lib_IntVector_Intrinsics_vec128_shift_left(lo1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo3 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 lo31 = Lib_IntVector_Intrinsics_vec128_xor(lo3, lo2); + Lib_IntVector_Intrinsics_vec128 + hi1 = Lib_IntVector_Intrinsics_vec128_shift_right64(hi0, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + hi11 = Lib_IntVector_Intrinsics_vec128_shift_left(hi1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + hi2 = Lib_IntVector_Intrinsics_vec128_shift_left64(hi0, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 hi21 = Lib_IntVector_Intrinsics_vec128_xor(hi2, hi11); + Lib_IntVector_Intrinsics_vec128 + lo11 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo12 = Lib_IntVector_Intrinsics_vec128_shift_right(lo11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 hi22 = Lib_IntVector_Intrinsics_vec128_xor(hi21, lo12); + Lib_IntVector_Intrinsics_vec128 lo4 = lo31; + Lib_IntVector_Intrinsics_vec128 hi3 = hi22; + Lib_IntVector_Intrinsics_vec128 + lo13 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo21 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)62U); + Lib_IntVector_Intrinsics_vec128 + lo32 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)57U); + Lib_IntVector_Intrinsics_vec128 lo14 = Lib_IntVector_Intrinsics_vec128_xor(lo13, lo21); + Lib_IntVector_Intrinsics_vec128 lo15 = Lib_IntVector_Intrinsics_vec128_xor(lo14, lo32); + Lib_IntVector_Intrinsics_vec128 + lo22 = Lib_IntVector_Intrinsics_vec128_shift_right(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo33 = Lib_IntVector_Intrinsics_vec128_shift_left(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo5 = 
Lib_IntVector_Intrinsics_vec128_xor(lo4, lo33); + Lib_IntVector_Intrinsics_vec128 lo_ = lo22; + Lib_IntVector_Intrinsics_vec128 + lo16 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 + lo23 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 + lo34 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo5, (uint32_t)7U); + Lib_IntVector_Intrinsics_vec128 lo17 = Lib_IntVector_Intrinsics_vec128_xor(lo16, lo23); + Lib_IntVector_Intrinsics_vec128 lo18 = Lib_IntVector_Intrinsics_vec128_xor(lo17, lo34); + Lib_IntVector_Intrinsics_vec128 lo19 = Lib_IntVector_Intrinsics_vec128_xor(lo18, lo_); + Lib_IntVector_Intrinsics_vec128 lo6 = Lib_IntVector_Intrinsics_vec128_xor(lo5, lo19); + Lib_IntVector_Intrinsics_vec128 lo7 = Lib_IntVector_Intrinsics_vec128_xor(lo6, hi3); + Lib_IntVector_Intrinsics_vec128 lo110 = lo7; + x[0U] = lo110; +} + +/* Fills pre[0..3] from the key bytes: pre[3] = r (key read with load_be), pre[2] = r*r, pre[1] = r*pre[2], pre[0] = r*pre[1], each product computed by fmul0. */ static inline void load_precompute_r(Lib_IntVector_Intrinsics_vec128 *pre, uint8_t *key) +{ + Lib_IntVector_Intrinsics_vec128 *r4 = pre; + Lib_IntVector_Intrinsics_vec128 *r3 = pre + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *r2 = pre + (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + r1[0U] = Lib_IntVector_Intrinsics_vec128_load_be(key); + r4[0U] = r1[0U]; + r3[0U] = r1[0U]; + r2[0U] = r1[0U]; + fmul0(r2, r1); + fmul0(r3, r2); + fmul0(r4, r3); +} + +/* Computes x[0]*pre[0] ^ x[1]*pre[1] ^ x[2]*pre[2] ^ x[3]*pre[3] and stores it in acc[0]. The low, middle and high clmul partial products of the four pairs are XORed together first, so the shift-by-one and fold-down steps (the same sequence as in fmul0) run once for all four products. */ static inline void +normalize4( + Lib_IntVector_Intrinsics_vec128 *acc, + Lib_IntVector_Intrinsics_vec128 *x, + Lib_IntVector_Intrinsics_vec128 *pre +) +{ + Lib_IntVector_Intrinsics_vec128 x1 = x[0U]; + Lib_IntVector_Intrinsics_vec128 x2 = x[1U]; + Lib_IntVector_Intrinsics_vec128 x3 = x[2U]; + Lib_IntVector_Intrinsics_vec128 x4 = x[3U]; + Lib_IntVector_Intrinsics_vec128 y1 = pre[0U]; + Lib_IntVector_Intrinsics_vec128 y2 = pre[1U]; + Lib_IntVector_Intrinsics_vec128 y3 = pre[2U]; + Lib_IntVector_Intrinsics_vec128 y4 = pre[3U]; + Lib_IntVector_Intrinsics_vec128 + lo10 = 
Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo2 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo30 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 + lo40 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x00U); + Lib_IntVector_Intrinsics_vec128 lo0 = Lib_IntVector_Intrinsics_vec128_xor(lo10, lo2); + Lib_IntVector_Intrinsics_vec128 lo5 = Lib_IntVector_Intrinsics_vec128_xor(lo0, lo30); + Lib_IntVector_Intrinsics_vec128 lo6 = Lib_IntVector_Intrinsics_vec128_xor(lo5, lo40); + Lib_IntVector_Intrinsics_vec128 m1 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m2 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m3 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m4 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x10U); + Lib_IntVector_Intrinsics_vec128 m = Lib_IntVector_Intrinsics_vec128_xor(m1, m2); + Lib_IntVector_Intrinsics_vec128 m5 = Lib_IntVector_Intrinsics_vec128_xor(m, m3); + Lib_IntVector_Intrinsics_vec128 m6 = Lib_IntVector_Intrinsics_vec128_xor(m5, m4); + Lib_IntVector_Intrinsics_vec128 + m11 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m21 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m31 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 + m41 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x01U); + Lib_IntVector_Intrinsics_vec128 m7 = Lib_IntVector_Intrinsics_vec128_xor(m6, m11); + Lib_IntVector_Intrinsics_vec128 m8 = Lib_IntVector_Intrinsics_vec128_xor(m7, m21); + Lib_IntVector_Intrinsics_vec128 m9 = Lib_IntVector_Intrinsics_vec128_xor(m8, m31); + Lib_IntVector_Intrinsics_vec128 m10 = 
Lib_IntVector_Intrinsics_vec128_xor(m9, m41); + Lib_IntVector_Intrinsics_vec128 + hi10 = Lib_IntVector_Intrinsics_ni_clmul(x1, y1, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi20 = Lib_IntVector_Intrinsics_ni_clmul(x2, y2, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi30 = Lib_IntVector_Intrinsics_ni_clmul(x3, y3, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 + hi4 = Lib_IntVector_Intrinsics_ni_clmul(x4, y4, (uint8_t)0x11U); + Lib_IntVector_Intrinsics_vec128 hi = Lib_IntVector_Intrinsics_vec128_xor(hi10, hi20); + Lib_IntVector_Intrinsics_vec128 hi5 = Lib_IntVector_Intrinsics_vec128_xor(hi, hi30); + Lib_IntVector_Intrinsics_vec128 hi6 = Lib_IntVector_Intrinsics_vec128_xor(hi5, hi4); + Lib_IntVector_Intrinsics_vec128 + m12 = Lib_IntVector_Intrinsics_vec128_shift_left(m10, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + m22 = Lib_IntVector_Intrinsics_vec128_shift_right(m10, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo7 = Lib_IntVector_Intrinsics_vec128_xor(lo6, m12); + Lib_IntVector_Intrinsics_vec128 hi7 = Lib_IntVector_Intrinsics_vec128_xor(hi6, m22); + Lib_IntVector_Intrinsics_vec128 hi0 = hi7; + Lib_IntVector_Intrinsics_vec128 lo = lo7; + Lib_IntVector_Intrinsics_vec128 + lo1 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo20 = Lib_IntVector_Intrinsics_vec128_shift_left(lo1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo3 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 lo31 = Lib_IntVector_Intrinsics_vec128_xor(lo3, lo20); + Lib_IntVector_Intrinsics_vec128 + hi1 = Lib_IntVector_Intrinsics_vec128_shift_right64(hi0, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + hi11 = Lib_IntVector_Intrinsics_vec128_shift_left(hi1, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + hi2 = Lib_IntVector_Intrinsics_vec128_shift_left64(hi0, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 hi21 = 
Lib_IntVector_Intrinsics_vec128_xor(hi2, hi11); + Lib_IntVector_Intrinsics_vec128 + lo11 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo12 = Lib_IntVector_Intrinsics_vec128_shift_right(lo11, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 hi22 = Lib_IntVector_Intrinsics_vec128_xor(hi21, lo12); + Lib_IntVector_Intrinsics_vec128 lo4 = lo31; + Lib_IntVector_Intrinsics_vec128 hi3 = hi22; + Lib_IntVector_Intrinsics_vec128 + lo13 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)63U); + Lib_IntVector_Intrinsics_vec128 + lo21 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)62U); + Lib_IntVector_Intrinsics_vec128 + lo32 = Lib_IntVector_Intrinsics_vec128_shift_left64(lo4, (uint32_t)57U); + Lib_IntVector_Intrinsics_vec128 lo14 = Lib_IntVector_Intrinsics_vec128_xor(lo13, lo21); + Lib_IntVector_Intrinsics_vec128 lo15 = Lib_IntVector_Intrinsics_vec128_xor(lo14, lo32); + Lib_IntVector_Intrinsics_vec128 + lo22 = Lib_IntVector_Intrinsics_vec128_shift_right(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 + lo33 = Lib_IntVector_Intrinsics_vec128_shift_left(lo15, (uint32_t)64U); + Lib_IntVector_Intrinsics_vec128 lo50 = Lib_IntVector_Intrinsics_vec128_xor(lo4, lo33); + Lib_IntVector_Intrinsics_vec128 lo_ = lo22; + Lib_IntVector_Intrinsics_vec128 + lo16 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)1U); + Lib_IntVector_Intrinsics_vec128 + lo23 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 + lo34 = Lib_IntVector_Intrinsics_vec128_shift_right64(lo50, (uint32_t)7U); + Lib_IntVector_Intrinsics_vec128 lo17 = Lib_IntVector_Intrinsics_vec128_xor(lo16, lo23); + Lib_IntVector_Intrinsics_vec128 lo18 = Lib_IntVector_Intrinsics_vec128_xor(lo17, lo34); + Lib_IntVector_Intrinsics_vec128 lo19 = Lib_IntVector_Intrinsics_vec128_xor(lo18, lo_); + Lib_IntVector_Intrinsics_vec128 lo60 = Lib_IntVector_Intrinsics_vec128_xor(lo50, lo19); + 
Lib_IntVector_Intrinsics_vec128 lo70 = Lib_IntVector_Intrinsics_vec128_xor(lo60, hi3); + Lib_IntVector_Intrinsics_vec128 lo110 = lo70; + acc[0U] = lo110; +} + +/* Initialises the context: ctx[0] (the accumulator) is zeroed and ctx[1..4] receive the table built by load_precompute_r from key. */ void Hacl_Gf128_NI_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + acc[0U] = Lib_IntVector_Intrinsics_vec128_zero; + load_precompute_r(pre, key); +} + +/* Absorbs len bytes of text into the accumulator ctx[0]. Whole 64-byte groups go through normalize4 with the accumulator XORed into the first block; the remaining 16-byte blocks are handled one at a time with fadd0 followed by fmul0 by pre[3]; a final partial block is zero-padded to 16 bytes and handled the same way. */ void +Hacl_Gf128_NI_gcm_update_blocks( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *text +) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 f[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + b4[0U] = Lib_IntVector_Intrinsics_vec128_load_be(tb); + b4[1U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)16U); + b4[2U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)32U); + b4[3U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)48U); + b4[0U] = Lib_IntVector_Intrinsics_vec128_xor(acc[0U], b4[0U]); + normalize4(acc, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + elem = Lib_IntVector_Intrinsics_vec128_load_be(tb); + fadd0(acc, &elem); + fmul0(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = 
Lib_IntVector_Intrinsics_vec128_zero; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + elem = Lib_IntVector_Intrinsics_vec128_load_be(b); + fadd0(acc, &elem); + fmul0(acc, r1); + return; + } +} + +/* Function pointer with the same signature as Hacl_Gf128_NI_gcm_update_blocks, initialised to point at it. */ void +(*Hacl_Gf128_NI_gcm_update_padded)( + Lib_IntVector_Intrinsics_vec128 *x0, + uint32_t x1, + uint8_t *x2 +) = Hacl_Gf128_NI_gcm_update_blocks; + +/* Writes the accumulator ctx[0] to tag using store_be. */ void Hacl_Gf128_NI_gcm_emit(uint8_t *tag, Lib_IntVector_Intrinsics_vec128 *ctx) +{ + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128_store_be(tag, acc[0U]); +} + +/* One-shot variant: builds a local 5-element context from key, absorbs len bytes of text with the same steps as Hacl_Gf128_NI_gcm_update_blocks (64-byte groups, 16-byte blocks, zero-padded tail), and writes the accumulator to tag with store_be. */ void Hacl_Gf128_NI_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[5U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *acc = ctx; + Lib_IntVector_Intrinsics_vec128 *pre0 = ctx + (uint32_t)1U; + acc[0U] = Lib_IntVector_Intrinsics_vec128_zero; + load_precompute_r(pre0, key); + Lib_IntVector_Intrinsics_vec128 *acc0 = ctx; + Lib_IntVector_Intrinsics_vec128 *pre = ctx + (uint32_t)1U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 f[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + b4[0U] = Lib_IntVector_Intrinsics_vec128_load_be(tb); + b4[1U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)16U); + b4[2U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)32U); + b4[3U] = Lib_IntVector_Intrinsics_vec128_load_be(tb + (uint32_t)48U); + b4[0U] = Lib_IntVector_Intrinsics_vec128_xor(acc0[0U], b4[0U]); + normalize4(acc0, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + Lib_IntVector_Intrinsics_vec128 *r1 = pre + (uint32_t)3U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = 
(uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + elem = Lib_IntVector_Intrinsics_vec128_load_be(tb); + fadd0(acc0, &elem); + fmul0(acc0, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 elem = Lib_IntVector_Intrinsics_vec128_zero; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + elem = Lib_IntVector_Intrinsics_vec128_load_be(b); + fadd0(acc0, &elem); + fmul0(acc0, r1); + } + Lib_IntVector_Intrinsics_vec128 *acc1 = ctx; + Lib_IntVector_Intrinsics_vec128_store_be(tag, acc1[0U]); +} + diff --git a/src/msvc/Hacl_Gf128_PreComp.c b/src/msvc/Hacl_Gf128_PreComp.c new file mode 100644 index 00000000..fa12b870 --- /dev/null +++ b/src/msvc/Hacl_Gf128_PreComp.c @@ -0,0 +1,461 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_Gf128_PreComp.h" + +void Hacl_Impl_Gf128_FieldPreComp_fmul(uint64_t *x, uint64_t *y) +{ + uint64_t res[2U] = { 0U }; + uint64_t y_[2U] = { 0U }; + y_[0U] = y[0U]; + y_[1U] = y[1U]; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); + res[0U] = res[0U] ^ (y_[0U] & m); + res[1U] = res[1U] ^ (y_[1U] & m); + uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); + y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; + y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); + res[0U] = res[0U] ^ (y_[0U] & m); + res[1U] = res[1U] ^ (y_[1U] & m); + uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); + y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; + y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); + } + x[0U] = res[0U]; + x[1U] = res[1U]; +} + +static inline void prepare(uint64_t *pre, uint64_t *r) +{ + memset(pre, 0U, (uint32_t)256U * sizeof (uint64_t)); + uint64_t sh[2U] = { 0U }; + sh[0U] = r[0U]; + sh[1U] = r[1U]; + uint64_t *pre1 = pre; + uint64_t *pre2 = pre + (uint32_t)128U; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + memcpy(pre1 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); + uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); + sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; + sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + memcpy(pre2 + (uint32_t)2U * 
i, sh, (uint32_t)2U * sizeof (uint64_t)); + uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); + sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; + sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); + } +} + +void Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(uint64_t *pre, uint8_t *key) +{ + uint64_t *r4321 = pre; + uint64_t *r1 = r4321 + (uint32_t)6U; + uint64_t *r2 = r4321 + (uint32_t)4U; + uint64_t *r3 = r4321 + (uint32_t)2U; + uint64_t *r4 = r4321; + uint64_t *table2 = pre + (uint32_t)8U; + uint64_t u = load64_be(key); + r1[1U] = u; + uint64_t u0 = load64_be(key + (uint32_t)8U); + r1[0U] = u0; + r4[0U] = r1[0U]; + r4[1U] = r1[1U]; + r3[0U] = r1[0U]; + r3[1U] = r1[1U]; + r2[0U] = r1[0U]; + r2[1U] = r1[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(r2, r1); + Hacl_Impl_Gf128_FieldPreComp_fmul(r3, r2); + Hacl_Impl_Gf128_FieldPreComp_fmul(r4, r3); + prepare(table2, r4); +} + +static inline void fmul_pre(uint64_t *x, uint64_t *pre) +{ + uint64_t *tab = pre + (uint32_t)8U; + uint64_t tmp[2U] = { 0U }; + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t *uu____0 = tab + (uint32_t)2U * i; + uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); + tmp[0U] = tmp[0U] ^ (uu____0[0U] & m); + tmp[1U] = tmp[1U] ^ (uu____0[1U] & m); + } + for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) + { + uint64_t *uu____1 = tab + (uint32_t)128U + (uint32_t)2U * i; + uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); + tmp[0U] = tmp[0U] ^ (uu____1[0U] & m); + tmp[1U] = tmp[1U] ^ (uu____1[1U] & m); + } + x[0U] = tmp[0U]; + x[1U] = tmp[1U]; +} + +void Hacl_Impl_Gf128_FieldPreComp_fmul_r4(uint64_t *x, uint64_t *pre) +{ + fmul_pre(x, pre); + fmul_pre(x + (uint32_t)2U, pre); + fmul_pre(x + (uint32_t)4U, pre); + fmul_pre(x + (uint32_t)6U, pre); +} + +void Hacl_Impl_Gf128_FieldPreComp_normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) +{ + uint64_t *x1 = x; + uint64_t *x2 = x + (uint32_t)2U; + 
uint64_t *x3 = x + (uint32_t)4U; + uint64_t *x4 = x + (uint32_t)6U; + fmul_pre(x, pre); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)2U, pre + (uint32_t)2U); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)4U, pre + (uint32_t)4U); + Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)6U, pre + (uint32_t)6U); + acc[0U] = x1[0U]; + acc[1U] = x1[1U]; + acc[0U] = acc[0U] ^ x2[0U]; + acc[1U] = acc[1U] ^ x2[1U]; + acc[0U] = acc[0U] ^ x3[0U]; + acc[1U] = acc[1U] ^ x3[1U]; + acc[0U] = acc[0U] ^ x4[0U]; + acc[1U] = acc[1U] ^ x4[1U]; +} + +void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre, key); +} + +void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f0[8U] = { 0U }; + uint64_t *b4 = f0; + uint64_t f[8U] = { 0U }; + uint64_t *acc4 = f; + uint8_t *tb = t0; + memcpy(acc4, acc, (uint32_t)2U * sizeof (uint64_t)); + uint64_t *x00 = b4; + uint8_t *y00 = tb; + uint64_t *x10 = b4 + (uint32_t)2U; + uint8_t *y10 = tb + (uint32_t)16U; + uint64_t *x20 = b4 + (uint32_t)4U; + uint8_t *y20 = tb + (uint32_t)32U; + uint64_t *x30 = b4 + (uint32_t)6U; + uint8_t *y30 = tb + (uint32_t)48U; + uint64_t u0 = load64_be(y00); + x00[1U] = u0; + uint64_t u1 = load64_be(y00 + (uint32_t)8U); + x00[0U] = u1; + uint64_t u2 = load64_be(y10); + x10[1U] = u2; + uint64_t u3 = load64_be(y10 + (uint32_t)8U); + x10[0U] = u3; + uint64_t u4 = load64_be(y20); + x20[1U] = u4; + uint64_t u5 = load64_be(y20 + (uint32_t)8U); + x20[0U] = u5; + uint64_t u6 = load64_be(y30); + x30[1U] = u6; + uint64_t u7 = load64_be(y30 + (uint32_t)8U); + x30[0U] = u7; + uint64_t *x01 = acc4; + uint64_t *y01 = b4; + uint64_t *x11 = acc4 + 
(uint32_t)2U; + uint64_t *y11 = b4 + (uint32_t)2U; + uint64_t *x21 = acc4 + (uint32_t)4U; + uint64_t *y21 = b4 + (uint32_t)4U; + uint64_t *x31 = acc4 + (uint32_t)6U; + uint64_t *y31 = b4 + (uint32_t)6U; + x01[0U] = x01[0U] ^ y01[0U]; + x01[1U] = x01[1U] ^ y01[1U]; + x11[0U] = x11[0U] ^ y11[0U]; + x11[1U] = x11[1U] ^ y11[1U]; + x21[0U] = x21[0U] ^ y21[0U]; + x21[1U] = x21[1U] ^ y21[1U]; + x31[0U] = x31[0U] ^ y31[0U]; + x31[1U] = x31[1U] ^ y31[1U]; + uint32_t len1 = len0 - (uint32_t)64U; + uint8_t *text1 = t0 + (uint32_t)64U; + uint32_t nb = len1 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb1 = text1 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y02 = tb1; + uint64_t *x12 = b4 + (uint32_t)2U; + uint8_t *y12 = tb1 + (uint32_t)16U; + uint64_t *x22 = b4 + (uint32_t)4U; + uint8_t *y22 = tb1 + (uint32_t)32U; + uint64_t *x32 = b4 + (uint32_t)6U; + uint8_t *y32 = tb1 + (uint32_t)48U; + uint64_t u = load64_be(y02); + x0[1U] = u; + uint64_t u8 = load64_be(y02 + (uint32_t)8U); + x0[0U] = u8; + uint64_t u9 = load64_be(y12); + x12[1U] = u9; + uint64_t u10 = load64_be(y12 + (uint32_t)8U); + x12[0U] = u10; + uint64_t u11 = load64_be(y22); + x22[1U] = u11; + uint64_t u12 = load64_be(y22 + (uint32_t)8U); + x22[0U] = u12; + uint64_t u13 = load64_be(y32); + x32[1U] = u13; + uint64_t u14 = load64_be(y32 + (uint32_t)8U); + x32[0U] = u14; + Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); + uint64_t *x02 = acc4; + uint64_t *y0 = b4; + uint64_t *x1 = acc4 + (uint32_t)2U; + uint64_t *y1 = b4 + (uint32_t)2U; + uint64_t *x2 = acc4 + (uint32_t)4U; + uint64_t *y2 = b4 + (uint32_t)4U; + uint64_t *x3 = acc4 + (uint32_t)6U; + uint64_t *y3 = b4 + (uint32_t)6U; + x02[0U] = x02[0U] ^ y0[0U]; + x02[1U] = x02[1U] ^ y0[1U]; + x1[0U] = x1[0U] ^ y1[0U]; + x1[1U] = x1[1U] ^ y1[1U]; + x2[0U] = x2[0U] ^ y2[0U]; + x2[1U] = x2[1U] ^ y2[1U]; + x3[0U] = x3[0U] ^ y3[0U]; + x3[1U] = x3[1U] ^ y3[1U]; + } + Hacl_Impl_Gf128_FieldPreComp_normalize4(acc, acc4, pre); + } + 
uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r1 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); + return; + } +} + +void +(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = + Hacl_Gf128_PreComp_gcm_update_blocks; + +void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx) +{ + uint64_t *acc = ctx; + uint64_t r0 = acc[1U]; + uint64_t r1 = acc[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + +void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + uint64_t ctx[266U] = { 0U }; + uint64_t *acc = ctx; + uint64_t *pre0 = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre0, key); + uint64_t *acc0 = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f0[8U] = { 0U }; + uint64_t *b4 = f0; + uint64_t f[8U] = { 0U }; + uint64_t *acc4 = f; + uint8_t *tb = t0; + memcpy(acc4, acc0, (uint32_t)2U * sizeof (uint64_t)); + uint64_t *x00 = b4; + uint8_t *y00 = tb; + uint64_t *x10 = b4 + (uint32_t)2U; + uint8_t 
*y10 = tb + (uint32_t)16U; + uint64_t *x20 = b4 + (uint32_t)4U; + uint8_t *y20 = tb + (uint32_t)32U; + uint64_t *x30 = b4 + (uint32_t)6U; + uint8_t *y30 = tb + (uint32_t)48U; + uint64_t u0 = load64_be(y00); + x00[1U] = u0; + uint64_t u1 = load64_be(y00 + (uint32_t)8U); + x00[0U] = u1; + uint64_t u2 = load64_be(y10); + x10[1U] = u2; + uint64_t u3 = load64_be(y10 + (uint32_t)8U); + x10[0U] = u3; + uint64_t u4 = load64_be(y20); + x20[1U] = u4; + uint64_t u5 = load64_be(y20 + (uint32_t)8U); + x20[0U] = u5; + uint64_t u6 = load64_be(y30); + x30[1U] = u6; + uint64_t u7 = load64_be(y30 + (uint32_t)8U); + x30[0U] = u7; + uint64_t *x01 = acc4; + uint64_t *y01 = b4; + uint64_t *x11 = acc4 + (uint32_t)2U; + uint64_t *y11 = b4 + (uint32_t)2U; + uint64_t *x21 = acc4 + (uint32_t)4U; + uint64_t *y21 = b4 + (uint32_t)4U; + uint64_t *x31 = acc4 + (uint32_t)6U; + uint64_t *y31 = b4 + (uint32_t)6U; + x01[0U] = x01[0U] ^ y01[0U]; + x01[1U] = x01[1U] ^ y01[1U]; + x11[0U] = x11[0U] ^ y11[0U]; + x11[1U] = x11[1U] ^ y11[1U]; + x21[0U] = x21[0U] ^ y21[0U]; + x21[1U] = x21[1U] ^ y21[1U]; + x31[0U] = x31[0U] ^ y31[0U]; + x31[1U] = x31[1U] ^ y31[1U]; + uint32_t len1 = len0 - (uint32_t)64U; + uint8_t *text1 = t0 + (uint32_t)64U; + uint32_t nb = len1 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb1 = text1 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y02 = tb1; + uint64_t *x12 = b4 + (uint32_t)2U; + uint8_t *y12 = tb1 + (uint32_t)16U; + uint64_t *x22 = b4 + (uint32_t)4U; + uint8_t *y22 = tb1 + (uint32_t)32U; + uint64_t *x32 = b4 + (uint32_t)6U; + uint8_t *y32 = tb1 + (uint32_t)48U; + uint64_t u = load64_be(y02); + x0[1U] = u; + uint64_t u8 = load64_be(y02 + (uint32_t)8U); + x0[0U] = u8; + uint64_t u9 = load64_be(y12); + x12[1U] = u9; + uint64_t u10 = load64_be(y12 + (uint32_t)8U); + x12[0U] = u10; + uint64_t u11 = load64_be(y22); + x22[1U] = u11; + uint64_t u12 = load64_be(y22 + (uint32_t)8U); + x22[0U] = u12; + uint64_t u13 = load64_be(y32); + x32[1U] 
= u13; + uint64_t u14 = load64_be(y32 + (uint32_t)8U); + x32[0U] = u14; + Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); + uint64_t *x02 = acc4; + uint64_t *y0 = b4; + uint64_t *x1 = acc4 + (uint32_t)2U; + uint64_t *y1 = b4 + (uint32_t)2U; + uint64_t *x2 = acc4 + (uint32_t)4U; + uint64_t *y2 = b4 + (uint32_t)4U; + uint64_t *x3 = acc4 + (uint32_t)6U; + uint64_t *y3 = b4 + (uint32_t)6U; + x02[0U] = x02[0U] ^ y0[0U]; + x02[1U] = x02[1U] ^ y0[1U]; + x1[0U] = x1[0U] ^ y1[0U]; + x1[1U] = x1[1U] ^ y1[1U]; + x2[0U] = x2[0U] ^ y2[0U]; + x2[1U] = x2[1U] ^ y2[1U]; + x3[0U] = x3[0U] ^ y3[0U]; + x3[1U] = x3[1U] ^ y3[1U]; + } + Hacl_Impl_Gf128_FieldPreComp_normalize4(acc0, acc4, pre); + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r10 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); + } + uint64_t *acc1 = ctx; + uint64_t r0 = acc1[1U]; + uint64_t r1 = acc1[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + diff --git a/src/msvc/Hacl_Lib.c b/src/msvc/Hacl_Lib.c new file mode 100644 index 00000000..5be84b2b --- /dev/null +++ b/src/msvc/Hacl_Lib.c @@ -0,0 +1,193 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * 
Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "internal/Hacl_Lib.h" + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux32(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18446744069414584320U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)32U & m), + .snd = (a >> (uint32_t)32U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux16(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18446462603027742720U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)16U & m), + .snd = (a >> (uint32_t)16U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux_aux8(uint64_t a, uint64_t b) +{ + uint64_t m = (uint64_t)18374966859414961920U; + return + ( + (Lib_Transposition64x8_uint64x2){ + .fst = (a & ~m) ^ (b << (uint32_t)8U & m), + .snd = (a >> (uint32_t)8U & ~m) ^ (b & m) + } + ); +} + +static Lib_Transposition64x8_uint64x8 transpose_aux32(Lib_Transposition64x8_uint64x8 x) +{ + uint64_t x7 = x.snd.snd.snd; + uint64_t x6 = x.snd.snd.fst; + uint64_t x5 = x.snd.fst.snd; + uint64_t x4 = x.snd.fst.fst; + uint64_t x3 = x.fst.snd.snd; + uint64_t x2 = x.fst.snd.fst; + uint64_t x1 = x.fst.fst.snd; + uint64_t x0 = x.fst.fst.fst; + Lib_Transposition64x8_uint64x2 scrut0 = transpose_aux_aux32(x0, x4); + uint64_t y0 = scrut0.fst; + uint64_t y4 = scrut0.snd; + Lib_Transposition64x8_uint64x2 scrut1 = transpose_aux_aux32(x1, x5); + uint64_t y1 = scrut1.fst; + uint64_t y5 = scrut1.snd; + Lib_Transposition64x8_uint64x2 scrut2 = transpose_aux_aux32(x2, x6); + uint64_t y2 = scrut2.fst; + uint64_t y6 = scrut2.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux32(x3, x7); + uint64_t y3 = scrut.fst; + uint64_t y7 = scrut.snd; + return + ( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = y0, .snd = y1 }, .snd = { .fst = y2, .snd = y3 } }, + .snd = { .fst = { .fst = y4, .snd = y5 }, .snd = { .fst = y6, .snd = y7 } } + } + ); +} + +static 
Lib_Transposition64x8_uint64x4 transpose_aux16(Lib_Transposition64x8_uint64x4 x) +{ + uint64_t x3 = x.snd.snd; + uint64_t x2 = x.snd.fst; + uint64_t x1 = x.fst.snd; + uint64_t x0 = x.fst.fst; + Lib_Transposition64x8_uint64x2 scrut0 = transpose_aux_aux16(x0, x2); + uint64_t y0 = scrut0.fst; + uint64_t y2 = scrut0.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux16(x1, x3); + uint64_t y1 = scrut.fst; + uint64_t y3 = scrut.snd; + return + ( + (Lib_Transposition64x8_uint64x4){ + .fst = { .fst = y0, .snd = y1 }, + .snd = { .fst = y2, .snd = y3 } + } + ); +} + +static Lib_Transposition64x8_uint64x2 transpose_aux8(Lib_Transposition64x8_uint64x2 x) +{ + uint64_t x0 = x.fst; + uint64_t x1 = x.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux_aux8(x0, x1); + uint64_t y0 = scrut.fst; + uint64_t y1 = scrut.snd; + return ((Lib_Transposition64x8_uint64x2){ .fst = y0, .snd = y1 }); +} + +uint64_t Lib_Transposition64x8_transpose_bits64(uint64_t x) +{ + uint64_t m0 = (uint64_t)0x8040201008040201U; + uint64_t m1 = (uint64_t)0x4020100804020100U; + uint64_t m2 = (uint64_t)0x2010080402010000U; + uint64_t m3 = (uint64_t)0x1008040201000000U; + uint64_t m4 = (uint64_t)0x0804020100000000U; + uint64_t m5 = (uint64_t)0x0402010000000000U; + uint64_t m6 = (uint64_t)0x0201000000000000U; + uint64_t m7 = (uint64_t)0x0100000000000000U; + uint64_t y0 = x & m0; + uint64_t y1 = y0 | (x & m1) >> (uint32_t)7U; + uint64_t y2 = y1 | (x & m2) >> (uint32_t)14U; + uint64_t y3 = y2 | (x & m3) >> (uint32_t)21U; + uint64_t y4 = y3 | (x & m4) >> (uint32_t)28U; + uint64_t y5 = y4 | (x & m5) >> (uint32_t)35U; + uint64_t y6 = y5 | (x & m6) >> (uint32_t)42U; + uint64_t y7 = y6 | (x & m7) >> (uint32_t)49U; + uint64_t y8 = y7 | (x << (uint32_t)7U & m1); + uint64_t y9 = y8 | (x << (uint32_t)14U & m2); + uint64_t y10 = y9 | (x << (uint32_t)21U & m3); + uint64_t y11 = y10 | (x << (uint32_t)28U & m4); + uint64_t y12 = y11 | (x << (uint32_t)35U & m5); + uint64_t y13 = y12 | (x << (uint32_t)42U & 
m6); + return y13 | (x << (uint32_t)49U & m7); +} + +Lib_Transposition64x8_uint64x8 +Lib_Transposition64x8_transpose_bits64x8(Lib_Transposition64x8_uint64x8 a) +{ + Lib_Transposition64x8_uint64x8 scrut0 = transpose_aux32(a); + Lib_Transposition64x8_uint64x4 b0 = scrut0.fst; + Lib_Transposition64x8_uint64x4 b1 = scrut0.snd; + Lib_Transposition64x8_uint64x4 scrut1 = transpose_aux16(b0); + Lib_Transposition64x8_uint64x2 c0 = scrut1.fst; + Lib_Transposition64x8_uint64x2 c1 = scrut1.snd; + Lib_Transposition64x8_uint64x4 scrut2 = transpose_aux16(b1); + Lib_Transposition64x8_uint64x2 c2 = scrut2.fst; + Lib_Transposition64x8_uint64x2 c3 = scrut2.snd; + Lib_Transposition64x8_uint64x2 scrut3 = transpose_aux8(c0); + uint64_t d0 = scrut3.fst; + uint64_t d1 = scrut3.snd; + Lib_Transposition64x8_uint64x2 scrut4 = transpose_aux8(c1); + uint64_t d2 = scrut4.fst; + uint64_t d3 = scrut4.snd; + Lib_Transposition64x8_uint64x2 scrut5 = transpose_aux8(c2); + uint64_t d4 = scrut5.fst; + uint64_t d5 = scrut5.snd; + Lib_Transposition64x8_uint64x2 scrut = transpose_aux8(c3); + uint64_t d6 = scrut.fst; + uint64_t d7 = scrut.snd; + uint64_t e0 = Lib_Transposition64x8_transpose_bits64(d0); + uint64_t e1 = Lib_Transposition64x8_transpose_bits64(d1); + uint64_t e2 = Lib_Transposition64x8_transpose_bits64(d2); + uint64_t e3 = Lib_Transposition64x8_transpose_bits64(d3); + uint64_t e4 = Lib_Transposition64x8_transpose_bits64(d4); + uint64_t e5 = Lib_Transposition64x8_transpose_bits64(d5); + uint64_t e6 = Lib_Transposition64x8_transpose_bits64(d6); + uint64_t e7 = Lib_Transposition64x8_transpose_bits64(d7); + return + ( + (Lib_Transposition64x8_uint64x8){ + .fst = { .fst = { .fst = e0, .snd = e1 }, .snd = { .fst = e2, .snd = e3 } }, + .snd = { .fst = { .fst = e4, .snd = e5 }, .snd = { .fst = e6, .snd = e7 } } + } + ); +} + From bcf59621cadc503030346f16cd3a93a897f84501 Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Tue, 18 Jul 2023 12:50:15 +0300 Subject: 
[PATCH 03/10] Add internal/Hacl_Spec.h to MSVC --- include/msvc/internal/Hacl_Spec.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/msvc/internal/Hacl_Spec.h b/include/msvc/internal/Hacl_Spec.h index fee56d84..e3ddd18f 100644 --- a/include/msvc/internal/Hacl_Spec.h +++ b/include/msvc/internal/Hacl_Spec.h @@ -40,6 +40,10 @@ extern "C" { #define Spec_Cipher_Expansion_Hacl_CHACHA20 0 #define Spec_Cipher_Expansion_Vale_AES128 1 #define Spec_Cipher_Expansion_Vale_AES256 2 +#define Spec_Cipher_Expansion_AESNI_PCLMUL_AES128 3 +#define Spec_Cipher_Expansion_AESNI_PCLMUL_AES256 4 +#define Spec_Cipher_Expansion_M32_AES128 5 +#define Spec_Cipher_Expansion_M32_AES256 6 typedef uint8_t Spec_Cipher_Expansion_impl; From ff141587f41b12893cb985a8336f5ca8f4633ffc Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Thu, 20 Jul 2023 08:52:32 +0300 Subject: [PATCH 04/10] Fix ocaml and rust builds --- opam.sh | 1 + rust/hacl-sys/build.rs | 1 + 2 files changed, 2 insertions(+) diff --git a/opam.sh b/opam.sh index ba03cf31..505596a3 100755 --- a/opam.sh +++ b/opam.sh @@ -14,6 +14,7 @@ cp include/* opam/hacl-star-raw/include/ | true cp -r include/internal opam/hacl-star-raw/include/internal cp -r vale opam/hacl-star-raw cp -r karamel opam/hacl-star-raw +cp -r cpu-features opam/hacl-star-raw cp CMakeLists.txt opam/hacl-star-raw diff --git a/rust/hacl-sys/build.rs b/rust/hacl-sys/build.rs index 6f953c1e..2aec025a 100644 --- a/rust/hacl-sys/build.rs +++ b/rust/hacl-sys/build.rs @@ -159,6 +159,7 @@ fn copy_hacl_to_out(out_dir: &Path) { copy(&local_c_path.join("vale"), &out_dir, &options).unwrap(); copy(&local_c_path.join("karamel"), &out_dir, &options).unwrap(); copy(&local_c_path.join("include"), &out_dir, &options).unwrap(); + copy(&local_c_path.join("cpu-features"), &out_dir, &options).unwrap(); let options = file::CopyOptions::new().overwrite(true); file::copy( From d67a8b40ed9b9a70ba79af239369c1b5ff1e9422 Mon Sep 17 00:00:00 2001 From: 
mamonet <66893036+mamonet@users.noreply.github.com> Date: Thu, 20 Jul 2023 10:25:01 +0300 Subject: [PATCH 05/10] Improve procedure of AESNI_PCLMUL compile in CMakeLists.txt --- CMakeLists.txt | 93 +++++++++++++++------------------ cpu-features/src/cpu-features.c | 1 + 2 files changed, 44 insertions(+), 50 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 03ee9969..21bd4886 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -84,6 +84,42 @@ set(CMAKE_C_STANDARD_REQUIRED True) # Read config from file include(build/config.cmake) +# x64 +# Set the architecture here. These come from the CMAKE_TOOLCHAIN_FILE +if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64") + message(STATUS "Detected an x64 architecture") + set(ARCHITECTURE intel) + set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_X64}) + +# x86 +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86") + message(STATUS "Detected an x86 architecture") + set(ARCHITECTURE intel) + set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_X86}) + +# arm64 +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8") + message(STATUS "Detected an arm64 architecture") + set(ARCHITECTURE arm) + set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_ARM64}) + +# arm32 +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armel|armhf|armv7|arm32v7") + message(STATUS "Detected an arm32 architecture") + set(ARCHITECTURE arm) + set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_ARM32}) + +# s390x +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") + message(STATUS "Detected an s390x (systemz) architecture") + set(ARCHITECTURE arm) + set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_SYSTEMZ}) + +# unsupported architecture +else() + message(FATAL_ERROR "Unsupported architecture ${CMAKE_SYSTEM_PROCESSOR}") +endif() + # Configure different targets # TODO: Set flags for MSVC if(NOT MSVC) @@ -197,7 +233,7 @@ if(TOOLCHAIN_CAN_COMPILE_VEC128) add_library(hacl_vec128 OBJECT ${SOURCES_vec128}) target_include_directories(hacl_vec128 PRIVATE) 
- if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86_64|amd64|AMD64") + if(ARCHITECTURE MATCHES intel) if(MSVC) # Nothing to do here. MSVC has it covered else() @@ -208,7 +244,7 @@ if(TOOLCHAIN_CAN_COMPILE_VEC128) -msse4.2 ) endif(MSVC) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8") + elseif(HACL_TARGET_ARCHITECTURE MATCHES ${HACL_ARCHITECTURE_ARM64}) target_compile_options(hacl_vec128 PRIVATE -march=armv8-a+simd ) @@ -252,7 +288,7 @@ if(TOOLCHAIN_CAN_COMPILE_VEC256) target_include_directories(hacl_vec256 PRIVATE) # We really should only get here on x86 architectures. But let's make sure. - if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86|x86_64|amd64|AMD64") + if(ARCHITECTURE MATCHES intel) if(MSVC) target_compile_options(hacl_vec256 PRIVATE /arch:AVX @@ -285,10 +321,7 @@ if(TOOLCHAIN_CAN_COMPILE_VALE) set(HACL_CAN_COMPILE_VALE 1) endif() -if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL) - add_compile_options( - -DHACL_CAN_COMPILE_VEC128 - ) +if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL AND TOOLCHAIN_CAN_COMPILE_VEC128 AND ((ARCHITECTURE MATCHES intel AND TOOLCHAIN_CAN_COMPILE_VEC256) OR HACL_TARGET_ARCHITECTURE MATCHES ${HACL_ARCHITECTURE_ARM64})) set(HACL_CAN_COMPILE_AESNI_PCLMUL 1) # # We make separate compilation units (objects) for each hardware feature @@ -298,11 +331,7 @@ if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL) set(HACL_AESNI_PCLMUL_O ON) add_library(hacl_aesni_pclmul OBJECT ${SOURCES_aesni_pclmul}) target_include_directories(hacl_aesni_pclmul PRIVATE) - if(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86_64|amd64|AMD64") - add_compile_options( - -DHACL_CAN_COMPILE_VEC256 - ) - + if(ARCHITECTURE MATCHES intel) if(MSVC) # Nothing to do here. 
MSVC has it covered else() @@ -315,7 +344,7 @@ if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL) -mpclmul ) endif(MSVC) - elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8") + elseif(HACL_TARGET_ARCHITECTURE MATCHES ${HACL_ARCHITECTURE_ARM64}) target_compile_options(hacl_aesni_pclmul PRIVATE -march=armv8-a+crypto ) @@ -333,42 +362,6 @@ if(TOOLCHAIN_CAN_COMPILE_INTRINSICS) set(HACL_CAN_COMPILE_INTRINSICS 1) endif() -# x64 -# Set the architecture here. These come from the CMAKE_TOOLCHAIN_FILE -if(CMAKE_SYSTEM_PROCESSOR MATCHES "x86_64|amd64|AMD64") - message(STATUS "Detected an x64 architecture") - set(ARCHITECTURE intel) - set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_X64}) - -# x86 -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "i386|i586|i686|i86pc|ia32|x86") - message(STATUS "Detected an x86 architecture") - set(ARCHITECTURE intel) - set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_X86}) - -# arm64 -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "aarch64|arm64|arm64v8") - message(STATUS "Detected an arm64 architecture") - set(ARCHITECTURE arm) - set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_ARM64}) - -# arm32 -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "armel|armhf|armv7|arm32v7") - message(STATUS "Detected an arm32 architecture") - set(ARCHITECTURE arm) - set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_ARM32}) - -# s390x -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "s390x") - message(STATUS "Detected an s390x (systemz) architecture") - set(ARCHITECTURE arm) - set(HACL_TARGET_ARCHITECTURE ${HACL_ARCHITECTURE_SYSTEMZ}) - -# unsupported architecture -else() - message(FATAL_ERROR "Unsupported architecture ${CMAKE_SYSTEM_PROCESSOR}") -endif() - # Write configuration configure_file(config/Config.h.in config.h) @@ -447,7 +440,7 @@ install(DIRECTORY vale/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/vale # # Install config.h install(FILES build/config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hacl) -# CPU feature detection for tests +# CPU feature detection add_library(hacl_cpu_features 
OBJECT ${PROJECT_SOURCE_DIR}/cpu-features/src/cpu-features.c) target_include_directories(hacl_cpu_features PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) add_dependencies(hacl hacl_cpu_features) diff --git a/cpu-features/src/cpu-features.c b/cpu-features/src/cpu-features.c index f22de709..367494fc 100644 --- a/cpu-features/src/cpu-features.c +++ b/cpu-features/src/cpu-features.c @@ -232,6 +232,7 @@ hacl_init_cpu_features() int64_t ret = 0; size_t size = sizeof(ret); + // Check for general support of Advanced SIMD instructions err = sysctlbyname("hw.optional.AdvSIMD", &ret, &size, NULL, 0); _asimd = (err == 0 && ret > 0) ? 1 : 0; From d93469ea8bd454f4bb93086badfaa47e2d8ea546 Mon Sep 17 00:00:00 2001 From: Maamoun TK Date: Sat, 5 Aug 2023 19:31:16 +0300 Subject: [PATCH 06/10] Update AES-GCM modules with GHASH CT64 integration --- CMakeLists.txt | 6 +- benchmarks/aesgcm.cc | 18 +- config/config.json | 24 +- config/default_config.cmake | 26 +- ...tSlice.h => Hacl_AES_128_CTR32_BitSlice.h} | 21 +- ...l_AES_128_NI.h => Hacl_AES_128_CTR32_NI.h} | 25 +- ..._128_GCM_M32.h => Hacl_AES_128_GCM_CT64.h} | 20 +- include/Hacl_AES_128_GCM_NI.h | 2 +- include/Hacl_AES_256_CTR32_BitSlice.h | 83 + include/Hacl_AES_256_CTR32_NI.h | 95 + ..._128_GCM_M32.h => Hacl_AES_256_GCM_CT64.h} | 20 +- include/Hacl_AES_256_GCM_NI.h | 75 + ...Hacl_Gf128_PreComp.h => Hacl_Gf128_CT64.h} | 16 +- .../Hacl_AES_128_CTR32_BitSlice.h} | 8 +- include/internal/Hacl_Spec.h | 4 +- ...tSlice.h => Hacl_AES_128_CTR32_BitSlice.h} | 21 +- .../Hacl_AES_128_CTR32_NI.h} | 25 +- include/msvc/Hacl_AES_128_GCM_CT64.h | 76 + include/msvc/Hacl_AES_128_GCM_NI.h | 2 +- include/msvc/Hacl_AES_256_CTR32_BitSlice.h | 83 + include/msvc/Hacl_AES_256_CTR32_NI.h | 95 + include/msvc/Hacl_AES_256_GCM_CT64.h | 76 + include/msvc/Hacl_AES_256_GCM_NI.h | 75 + .../Hacl_Gf128_CT64.h} | 16 +- .../internal/Hacl_AES_128_CTR32_BitSlice.h} | 8 +- include/msvc/internal/Hacl_Spec.h | 4 +- src/EverCrypt_AEAD.c | 785 ++++++- ...tSlice.c => 
Hacl_AES_128_CTR32_BitSlice.c} | 12 +- ...l_AES_128_NI.c => Hacl_AES_128_CTR32_NI.c} | 64 +- src/Hacl_AES_128_GCM_CT64.c | 210 ++ src/Hacl_AES_128_GCM_NI.c | 91 +- src/Hacl_AES_256_CTR32_BitSlice.c | 634 ++++++ src/Hacl_AES_256_CTR32_NI.c | 1433 +++++++++++++ ..._128_GCM_M32.c => Hacl_AES_256_GCM_CT64.c} | 108 +- src/Hacl_AES_256_GCM_NI.c | 182 ++ src/Hacl_Gf128_CT64.c | 1801 +++++++++++++++++ src/Hacl_Gf128_PreComp.c | 461 ----- src/msvc/EverCrypt_AEAD.c | 785 ++++++- ...tSlice.c => Hacl_AES_128_CTR32_BitSlice.c} | 12 +- ...l_AES_128_NI.c => Hacl_AES_128_CTR32_NI.c} | 64 +- src/msvc/Hacl_AES_128_GCM_CT64.c | 210 ++ src/msvc/Hacl_AES_128_GCM_NI.c | 91 +- src/msvc/Hacl_AES_256_CTR32_BitSlice.c | 634 ++++++ src/msvc/Hacl_AES_256_CTR32_NI.c | 1433 +++++++++++++ ..._128_GCM_M32.c => Hacl_AES_256_GCM_CT64.c} | 108 +- src/msvc/Hacl_AES_256_GCM_NI.c | 182 ++ src/msvc/Hacl_Gf128_CT64.c | 1801 +++++++++++++++++ src/msvc/Hacl_Gf128_PreComp.c | 461 ----- tests/aead.cc | 18 +- 49 files changed, 11004 insertions(+), 1500 deletions(-) rename include/{Hacl_AES_128_BitSlice.h => Hacl_AES_128_CTR32_BitSlice.h} (73%) rename include/{msvc/Hacl_AES_128_NI.h => Hacl_AES_128_CTR32_NI.h} (75%) rename include/{Hacl_AES_128_GCM_M32.h => Hacl_AES_128_GCM_CT64.h} (79%) create mode 100644 include/Hacl_AES_256_CTR32_BitSlice.h create mode 100644 include/Hacl_AES_256_CTR32_NI.h rename include/{msvc/Hacl_AES_128_GCM_M32.h => Hacl_AES_256_GCM_CT64.h} (79%) create mode 100644 include/Hacl_AES_256_GCM_NI.h rename include/{msvc/Hacl_Gf128_PreComp.h => Hacl_Gf128_CT64.h} (74%) rename include/{msvc/internal/Hacl_AES_128_BitSlice.h => internal/Hacl_AES_128_CTR32_BitSlice.h} (92%) rename include/msvc/{Hacl_AES_128_BitSlice.h => Hacl_AES_128_CTR32_BitSlice.h} (73%) rename include/{Hacl_AES_128_NI.h => msvc/Hacl_AES_128_CTR32_NI.h} (75%) create mode 100644 include/msvc/Hacl_AES_128_GCM_CT64.h create mode 100644 include/msvc/Hacl_AES_256_CTR32_BitSlice.h create mode 100644 
include/msvc/Hacl_AES_256_CTR32_NI.h create mode 100644 include/msvc/Hacl_AES_256_GCM_CT64.h create mode 100644 include/msvc/Hacl_AES_256_GCM_NI.h rename include/{Hacl_Gf128_PreComp.h => msvc/Hacl_Gf128_CT64.h} (74%) rename include/{internal/Hacl_AES_128_BitSlice.h => msvc/internal/Hacl_AES_128_CTR32_BitSlice.h} (92%) rename src/{Hacl_AES_128_BitSlice.c => Hacl_AES_128_CTR32_BitSlice.c} (98%) rename src/{Hacl_AES_128_NI.c => Hacl_AES_128_CTR32_NI.c} (96%) create mode 100644 src/Hacl_AES_128_GCM_CT64.c create mode 100644 src/Hacl_AES_256_CTR32_BitSlice.c create mode 100644 src/Hacl_AES_256_CTR32_NI.c rename src/{Hacl_AES_128_GCM_M32.c => Hacl_AES_256_GCM_CT64.c} (64%) create mode 100644 src/Hacl_AES_256_GCM_NI.c create mode 100644 src/Hacl_Gf128_CT64.c delete mode 100644 src/Hacl_Gf128_PreComp.c rename src/msvc/{Hacl_AES_128_BitSlice.c => Hacl_AES_128_CTR32_BitSlice.c} (98%) rename src/msvc/{Hacl_AES_128_NI.c => Hacl_AES_128_CTR32_NI.c} (96%) create mode 100644 src/msvc/Hacl_AES_128_GCM_CT64.c create mode 100644 src/msvc/Hacl_AES_256_CTR32_BitSlice.c create mode 100644 src/msvc/Hacl_AES_256_CTR32_NI.c rename src/msvc/{Hacl_AES_128_GCM_M32.c => Hacl_AES_256_GCM_CT64.c} (64%) create mode 100644 src/msvc/Hacl_AES_256_GCM_NI.c create mode 100644 src/msvc/Hacl_Gf128_CT64.c delete mode 100644 src/msvc/Hacl_Gf128_PreComp.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 21bd4886..f0574b7f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -321,7 +321,9 @@ if(TOOLCHAIN_CAN_COMPILE_VALE) set(HACL_CAN_COMPILE_VALE 1) endif() -if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL AND TOOLCHAIN_CAN_COMPILE_VEC128 AND ((ARCHITECTURE MATCHES intel AND TOOLCHAIN_CAN_COMPILE_VEC256) OR HACL_TARGET_ARCHITECTURE MATCHES ${HACL_ARCHITECTURE_ARM64})) +if(TOOLCHAIN_CAN_COMPILE_AESNI_PCLMUL AND TOOLCHAIN_CAN_COMPILE_VEC128 AND + ((ARCHITECTURE MATCHES intel AND TOOLCHAIN_CAN_COMPILE_VEC256) OR + HACL_TARGET_ARCHITECTURE MATCHES ${HACL_ARCHITECTURE_ARM64})) set(HACL_CAN_COMPILE_AESNI_PCLMUL 1) # # We 
make separate compilation units (objects) for each hardware feature @@ -440,7 +442,7 @@ install(DIRECTORY vale/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/vale # # Install config.h install(FILES build/config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hacl) -# CPU feature detection +# CPU feature detection for tests add_library(hacl_cpu_features OBJECT ${PROJECT_SOURCE_DIR}/cpu-features/src/cpu-features.c) target_include_directories(hacl_cpu_features PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) add_dependencies(hacl hacl_cpu_features) diff --git a/benchmarks/aesgcm.cc b/benchmarks/aesgcm.cc index edc41310..891b5ad8 100644 --- a/benchmarks/aesgcm.cc +++ b/benchmarks/aesgcm.cc @@ -12,7 +12,7 @@ #ifdef HACL_CAN_COMPILE_AESNI_PCLMUL #include "Hacl_AES_128_GCM_NI.h" #endif -#include "Hacl_AES_128_GCM_M32.h" +#include "Hacl_AES_128_GCM_CT64.h" #include "EverCrypt_AEAD.h" #include "../third-party/bearssl/bearssl_block.h" #include "../third-party/bearssl/bearssl_hash.h" @@ -56,35 +56,35 @@ BENCHMARK(HACL_AES_128_GCM_NI_aad)->Setup(DoSetup)->Apply(Range); #endif static void -HACL_AES_128_GCM_M32_encrypt(benchmark::State& state) +HACL_AES_128_GCM_CT64_encrypt(benchmark::State& state) { bytes plaintext(state.range(0), 0x37); bytes ciphertext(state.range(0) + 16, 0); for (auto _ : state) { uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); - Hacl_AES_128_GCM_M32_aes128_gcm_init(ctx, key.data()); - Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); + Hacl_AES_128_GCM_CT64_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); } } -BENCHMARK(HACL_AES_128_GCM_M32_encrypt)->Setup(DoSetup)->Apply(Range); +BENCHMARK(HACL_AES_128_GCM_CT64_encrypt)->Setup(DoSetup)->Apply(Range); static void 
-HACL_AES_128_GCM_M32_aad(benchmark::State& state) +HACL_AES_128_GCM_CT64_aad(benchmark::State& state) { bytes aad(state.range(0), 0x37); for (auto _ : state) { uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); - Hacl_AES_128_GCM_M32_aes128_gcm_init(ctx, key.data()); - Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); + Hacl_AES_128_GCM_CT64_aes128_gcm_init(ctx, key.data()); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); } } -BENCHMARK(HACL_AES_128_GCM_M32_aad)->Setup(DoSetup)->Apply(Range); +BENCHMARK(HACL_AES_128_GCM_CT64_aad)->Setup(DoSetup)->Apply(Range); static void EverCrypt_AES128_GCM_encrypt(benchmark::State& state) diff --git a/config/config.json b/config/config.json index e06d1780..d9d695eb 100644 --- a/config/config.json +++ b/config/config.json @@ -28,15 +28,23 @@ "features": "std" }, { - "file": "Hacl_Gf128_PreComp.c", + "file": "Hacl_Gf128_CT64.c", "features": "std" }, { - "file": "Hacl_AES_128_BitSlice.c", + "file": "Hacl_AES_128_CTR32_BitSlice.c", "features": "std" }, { - "file": "Hacl_AES_128_GCM_M32.c", + "file": "Hacl_AES_128_GCM_CT64.c", + "features": "std" + }, + { + "file": "Hacl_AES_256_CTR32_BitSlice.c", + "features": "std" + }, + { + "file": "Hacl_AES_256_GCM_CT64.c", "features": "std" }, { @@ -44,12 +52,20 @@ "features": "aesni_pclmul" }, { - "file": "Hacl_AES_128_NI.c", + "file": "Hacl_AES_128_CTR32_NI.c", "features": "aesni_pclmul" }, { "file": "Hacl_AES_128_GCM_NI.c", "features": "aesni_pclmul" + }, + { + "file": "Hacl_AES_256_CTR32_NI.c", + "features": "aesni_pclmul" + }, + { + "file": "Hacl_AES_256_GCM_NI.c", + "features": "aesni_pclmul" } ], "drbg": [ diff --git a/config/default_config.cmake b/config/default_config.cmake index 892bbad0..b586af84 100644 --- a/config/default_config.cmake +++ b/config/default_config.cmake @@ -54,9 +54,11 @@ 
set(SOURCES_std ${PROJECT_SOURCE_DIR}/src/EverCrypt_Poly1305.c ${PROJECT_SOURCE_DIR}/src/EverCrypt_AEAD.c ${PROJECT_SOURCE_DIR}/src/Hacl_Lib.c - ${PROJECT_SOURCE_DIR}/src/Hacl_Gf128_PreComp.c - ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_BitSlice.c - ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_GCM_M32.c + ${PROJECT_SOURCE_DIR}/src/Hacl_Gf128_CT64.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_CTR32_BitSlice.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_GCM_CT64.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_256_CTR32_BitSlice.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_256_GCM_CT64.c ) set(SOURCES_vec256 ${PROJECT_SOURCE_DIR}/src/Hacl_Hash_Blake2b_256.c @@ -108,8 +110,10 @@ set(SOURCES_std_vale ) set(SOURCES_aesni_pclmul ${PROJECT_SOURCE_DIR}/src/Hacl_Gf128_NI.c - ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_NI.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_CTR32_NI.c ${PROJECT_SOURCE_DIR}/src/Hacl_AES_128_GCM_NI.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_256_CTR32_NI.c + ${PROJECT_SOURCE_DIR}/src/Hacl_AES_256_GCM_NI.c ) set(INCLUDES ${PROJECT_SOURCE_DIR}/include/Hacl_NaCl.h @@ -370,13 +374,17 @@ set(PUBLIC_INCLUDES ${PROJECT_SOURCE_DIR}/include/EverCrypt_Poly1305.h ${PROJECT_SOURCE_DIR}/include/EverCrypt_AEAD.h ${PROJECT_SOURCE_DIR}/include/internal/Hacl_Lib.h - ${PROJECT_SOURCE_DIR}/include/internal/Hacl_AES_128_BitSlice.h - ${PROJECT_SOURCE_DIR}/include/Hacl_Gf128_PreComp.h - ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_BitSlice.h - ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_GCM_M32.h + ${PROJECT_SOURCE_DIR}/include/internal/Hacl_AES_128_CTR32_BitSlice.h + ${PROJECT_SOURCE_DIR}/include/Hacl_Gf128_CT64.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_CTR32_BitSlice.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_GCM_CT64.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_256_CTR32_BitSlice.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_256_GCM_CT64.h ${PROJECT_SOURCE_DIR}/include/Hacl_Gf128_NI.h - ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_NI.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_CTR32_NI.h 
${PROJECT_SOURCE_DIR}/include/Hacl_AES_128_GCM_NI.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_256_CTR32_NI.h + ${PROJECT_SOURCE_DIR}/include/Hacl_AES_256_GCM_NI.h ) set(ALGORITHMS nacl diff --git a/include/Hacl_AES_128_BitSlice.h b/include/Hacl_AES_128_CTR32_BitSlice.h similarity index 73% rename from include/Hacl_AES_128_BitSlice.h rename to include/Hacl_AES_128_CTR32_BitSlice.h index 3a146a89..299c757c 100644 --- a/include/Hacl_AES_128_BitSlice.h +++ b/include/Hacl_AES_128_CTR32_BitSlice.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_BitSlice_H -#define __Hacl_AES_128_BitSlice_H +#ifndef __Hacl_AES_128_CTR32_BitSlice_H +#define __Hacl_AES_128_CTR32_BitSlice_H #if defined(__cplusplus) extern "C" { @@ -35,18 +35,19 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -typedef uint64_t *Hacl_AES_128_BitSlice_aes_ctx; +typedef uint64_t *Hacl_AES_128_CTR32_BitSlice_aes_ctx; -typedef uint8_t *Hacl_AES_128_BitSlice_skey; +typedef uint8_t *Hacl_AES_128_CTR32_BitSlice_skey; -void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); +void Hacl_AES_128_CTR32_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); -void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); +void Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); -void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); +void +Hacl_AES_128_CTR32_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); void -Hacl_AES_128_BitSlice_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -56,7 +57,7 @@ Hacl_AES_128_BitSlice_aes128_ctr_encrypt( ); void -Hacl_AES_128_BitSlice_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -69,5 +70,5 @@ Hacl_AES_128_BitSlice_aes128_ctr_decrypt( } #endif -#define __Hacl_AES_128_BitSlice_H_DEFINED 
+#define __Hacl_AES_128_CTR32_BitSlice_H_DEFINED #endif diff --git a/include/msvc/Hacl_AES_128_NI.h b/include/Hacl_AES_128_CTR32_NI.h similarity index 75% rename from include/msvc/Hacl_AES_128_NI.h rename to include/Hacl_AES_128_CTR32_NI.h index f3c148b5..7f854abd 100644 --- a/include/msvc/Hacl_AES_128_NI.h +++ b/include/Hacl_AES_128_CTR32_NI.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_NI_H -#define __Hacl_AES_128_NI_H +#ifndef __Hacl_AES_128_CTR32_NI_H +#define __Hacl_AES_128_CTR32_NI_H #if defined(__cplusplus) extern "C" { @@ -37,24 +37,29 @@ extern "C" { #include "libintvector.h" -typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_NI_aes_ctx; +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_CTR32_NI_aes_ctx; -typedef uint8_t *Hacl_AES_128_NI_skey; +typedef uint8_t *Hacl_AES_128_CTR32_NI_skey; void -Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce); +Hacl_AES_128_CTR32_NI_aes128_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +); -void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); +void +Hacl_AES_128_CTR32_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); void -Hacl_AES_128_NI_aes128_key_block( +Hacl_AES_128_CTR32_NI_aes128_key_block( uint8_t *kb, Lib_IntVector_Intrinsics_vec128 *ctx, uint32_t counter ); void -Hacl_AES_128_NI_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -64,7 +69,7 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( ); void -Hacl_AES_128_NI_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -77,5 +82,5 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( } #endif -#define __Hacl_AES_128_NI_H_DEFINED +#define __Hacl_AES_128_CTR32_NI_H_DEFINED #endif diff --git a/include/Hacl_AES_128_GCM_M32.h b/include/Hacl_AES_128_GCM_CT64.h similarity index 79% rename from include/Hacl_AES_128_GCM_M32.h rename to 
include/Hacl_AES_128_GCM_CT64.h index 29125377..edac7989 100644 --- a/include/Hacl_AES_128_GCM_M32.h +++ b/include/Hacl_AES_128_GCM_CT64.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_GCM_M32_H -#define __Hacl_AES_128_GCM_M32_H +#ifndef __Hacl_AES_128_GCM_CT64_H +#define __Hacl_AES_128_GCM_CT64_H #if defined(__cplusplus) extern "C" { @@ -35,17 +35,17 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -#include "Hacl_Gf128_PreComp.h" -#include "Hacl_AES_128_BitSlice.h" +#include "Hacl_Gf128_CT64.h" +#include "Hacl_AES_128_CTR32_BitSlice.h" -extern uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len; +extern uint32_t Hacl_AES_128_GCM_CT64_aes_gcm_ctx_len; -typedef uint64_t *Hacl_AES_128_GCM_M32_aes_gcm_ctx; +typedef uint64_t *Hacl_AES_128_GCM_CT64_aes_gcm_ctx; -void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key); +void Hacl_AES_128_GCM_CT64_aes128_gcm_init(uint64_t *ctx, uint8_t *key); void -Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( +Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -57,7 +57,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( ); bool -Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( +Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -72,5 +72,5 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( } #endif -#define __Hacl_AES_128_GCM_M32_H_DEFINED +#define __Hacl_AES_128_GCM_CT64_H_DEFINED #endif diff --git a/include/Hacl_AES_128_GCM_NI.h b/include/Hacl_AES_128_GCM_NI.h index ab520316..840637d2 100644 --- a/include/Hacl_AES_128_GCM_NI.h +++ b/include/Hacl_AES_128_GCM_NI.h @@ -36,7 +36,7 @@ extern "C" { #include "krml/internal/target.h" #include "Hacl_Gf128_NI.h" -#include "Hacl_AES_128_NI.h" +#include "Hacl_AES_128_CTR32_NI.h" #include "libintvector.h" typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_GCM_NI_aes_gcm_ctx; diff --git a/include/Hacl_AES_256_CTR32_BitSlice.h b/include/Hacl_AES_256_CTR32_BitSlice.h new file mode 100644 index 
00000000..cb0be803 --- /dev/null +++ b/include/Hacl_AES_256_CTR32_BitSlice.h @@ -0,0 +1,83 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_256_CTR32_BitSlice_H +#define __Hacl_AES_256_CTR32_BitSlice_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +typedef uint64_t *Hacl_AES_256_CTR32_BitSlice_aes_ctx; + +typedef uint8_t *Hacl_AES_256_CTR32_BitSlice_skey; + +void Hacl_AES_256_CTR32_BitSlice_aes256_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(uint64_t *ctx, uint8_t *nonce); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t c +); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_CTR32_BitSlice_H_DEFINED +#endif diff --git a/include/Hacl_AES_256_CTR32_NI.h b/include/Hacl_AES_256_CTR32_NI.h new file mode 100644 index 00000000..6b33b030 --- /dev/null +++ b/include/Hacl_AES_256_CTR32_NI.h @@ -0,0 +1,95 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright
notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_256_CTR32_NI_H +#define __Hacl_AES_256_CTR32_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_256_CTR32_NI_aes_ctx; + +typedef uint8_t *Hacl_AES_256_CTR32_NI_skey; + +void +Hacl_AES_256_CTR32_NI_aes256_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +); + +void +Hacl_AES_256_CTR32_NI_aes256_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); + +void +Hacl_AES_256_CTR32_NI_aes256_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t c +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_CTR32_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_128_GCM_M32.h b/include/Hacl_AES_256_GCM_CT64.h similarity index 79% rename from
include/msvc/Hacl_AES_128_GCM_M32.h rename to include/Hacl_AES_256_GCM_CT64.h index 29125377..3505f4fc 100644 --- a/include/msvc/Hacl_AES_128_GCM_M32.h +++ b/include/Hacl_AES_256_GCM_CT64.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_GCM_M32_H -#define __Hacl_AES_128_GCM_M32_H +#ifndef __Hacl_AES_256_GCM_CT64_H +#define __Hacl_AES_256_GCM_CT64_H #if defined(__cplusplus) extern "C" { @@ -35,17 +35,17 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -#include "Hacl_Gf128_PreComp.h" -#include "Hacl_AES_128_BitSlice.h" +#include "Hacl_Gf128_CT64.h" +#include "Hacl_AES_256_CTR32_BitSlice.h" -extern uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len; +extern uint32_t Hacl_AES_256_GCM_CT64_aes_gcm_ctx_len; -typedef uint64_t *Hacl_AES_128_GCM_M32_aes_gcm_ctx; +typedef uint64_t *Hacl_AES_256_GCM_CT64_aes_gcm_ctx; -void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key); +void Hacl_AES_256_GCM_CT64_aes256_gcm_init(uint64_t *ctx, uint8_t *key); void -Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -57,7 +57,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( ); bool -Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -72,5 +72,5 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( } #endif -#define __Hacl_AES_128_GCM_M32_H_DEFINED +#define __Hacl_AES_256_GCM_CT64_H_DEFINED #endif diff --git a/include/Hacl_AES_256_GCM_NI.h b/include/Hacl_AES_256_GCM_NI.h new file mode 100644 index 00000000..7408c4b8 --- /dev/null +++ b/include/Hacl_AES_256_GCM_NI.h @@ -0,0 +1,75 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without 
restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_256_GCM_NI_H +#define __Hacl_AES_256_GCM_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_NI.h" +#include "Hacl_AES_256_CTR32_NI.h" +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_256_GCM_NI_aes_gcm_ctx; + +void Hacl_AES_256_GCM_NI_aes256_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key); + +void +Hacl_AES_256_GCM_NI_aes256_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_256_GCM_NI_aes256_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_GCM_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_Gf128_PreComp.h b/include/Hacl_Gf128_CT64.h
similarity index 74% rename from include/msvc/Hacl_Gf128_PreComp.h rename to include/Hacl_Gf128_CT64.h index 3d67add1..d9eb236e 100644 --- a/include/msvc/Hacl_Gf128_PreComp.h +++ b/include/Hacl_Gf128_CT64.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_Gf128_PreComp_H -#define __Hacl_Gf128_PreComp_H +#ifndef __Hacl_Gf128_CT64_H +#define __Hacl_Gf128_CT64_H #if defined(__cplusplus) extern "C" { @@ -35,20 +35,20 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key); +void Hacl_Gf128_CT64_gcm_init(uint64_t *ctx, uint8_t *key); -void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text); +void Hacl_Gf128_CT64_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text); extern void -(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2); +(*Hacl_Gf128_CT64_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2); -void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx); +void Hacl_Gf128_CT64_gcm_emit(uint8_t *tag, uint64_t *ctx); -void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); +void Hacl_Gf128_CT64_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); #if defined(__cplusplus) } #endif -#define __Hacl_Gf128_PreComp_H_DEFINED +#define __Hacl_Gf128_CT64_H_DEFINED #endif diff --git a/include/msvc/internal/Hacl_AES_128_BitSlice.h b/include/internal/Hacl_AES_128_CTR32_BitSlice.h similarity index 92% rename from include/msvc/internal/Hacl_AES_128_BitSlice.h rename to include/internal/Hacl_AES_128_CTR32_BitSlice.h index 3b95bb9b..1d5eee82 100644 --- a/include/msvc/internal/Hacl_AES_128_BitSlice.h +++ b/include/internal/Hacl_AES_128_CTR32_BitSlice.h @@ -23,8 +23,8 @@ */ -#ifndef __internal_Hacl_AES_128_BitSlice_H -#define __internal_Hacl_AES_128_BitSlice_H +#ifndef __internal_Hacl_AES_128_CTR32_BitSlice_H +#define __internal_Hacl_AES_128_CTR32_BitSlice_H #if 
defined(__cplusplus) extern "C" { @@ -36,7 +36,7 @@ extern "C" { #include "krml/internal/target.h" #include "internal/Hacl_Lib.h" -#include "../Hacl_AES_128_BitSlice.h" +#include "../Hacl_AES_128_CTR32_BitSlice.h" void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp); @@ -79,5 +79,5 @@ Hacl_Impl_AES_Generic_aes256_ctr_bitslice( } #endif -#define __internal_Hacl_AES_128_BitSlice_H_DEFINED +#define __internal_Hacl_AES_128_CTR32_BitSlice_H_DEFINED #endif diff --git a/include/internal/Hacl_Spec.h b/include/internal/Hacl_Spec.h index e3ddd18f..5bbbe8bf 100644 --- a/include/internal/Hacl_Spec.h +++ b/include/internal/Hacl_Spec.h @@ -42,8 +42,8 @@ extern "C" { #define Spec_Cipher_Expansion_Vale_AES256 2 #define Spec_Cipher_Expansion_AESNI_PCLMUL_AES128 3 #define Spec_Cipher_Expansion_AESNI_PCLMUL_AES256 4 -#define Spec_Cipher_Expansion_M32_AES128 5 -#define Spec_Cipher_Expansion_M32_AES256 6 +#define Spec_Cipher_Expansion_CT64_AES128 5 +#define Spec_Cipher_Expansion_CT64_AES256 6 typedef uint8_t Spec_Cipher_Expansion_impl; diff --git a/include/msvc/Hacl_AES_128_BitSlice.h b/include/msvc/Hacl_AES_128_CTR32_BitSlice.h similarity index 73% rename from include/msvc/Hacl_AES_128_BitSlice.h rename to include/msvc/Hacl_AES_128_CTR32_BitSlice.h index 3a146a89..299c757c 100644 --- a/include/msvc/Hacl_AES_128_BitSlice.h +++ b/include/msvc/Hacl_AES_128_CTR32_BitSlice.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_BitSlice_H -#define __Hacl_AES_128_BitSlice_H +#ifndef __Hacl_AES_128_CTR32_BitSlice_H +#define __Hacl_AES_128_CTR32_BitSlice_H #if defined(__cplusplus) extern "C" { @@ -35,18 +35,19 @@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -typedef uint64_t *Hacl_AES_128_BitSlice_aes_ctx; +typedef uint64_t *Hacl_AES_128_CTR32_BitSlice_aes_ctx; -typedef uint8_t *Hacl_AES_128_BitSlice_skey; +typedef uint8_t *Hacl_AES_128_CTR32_BitSlice_skey; -void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); 
+void Hacl_AES_128_CTR32_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); -void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); +void Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce); -void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); +void +Hacl_AES_128_CTR32_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); void -Hacl_AES_128_BitSlice_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -56,7 +57,7 @@ Hacl_AES_128_BitSlice_aes128_ctr_encrypt( ); void -Hacl_AES_128_BitSlice_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -69,5 +70,5 @@ Hacl_AES_128_BitSlice_aes128_ctr_decrypt( } #endif -#define __Hacl_AES_128_BitSlice_H_DEFINED +#define __Hacl_AES_128_CTR32_BitSlice_H_DEFINED #endif diff --git a/include/Hacl_AES_128_NI.h b/include/msvc/Hacl_AES_128_CTR32_NI.h similarity index 75% rename from include/Hacl_AES_128_NI.h rename to include/msvc/Hacl_AES_128_CTR32_NI.h index f3c148b5..7f854abd 100644 --- a/include/Hacl_AES_128_NI.h +++ b/include/msvc/Hacl_AES_128_CTR32_NI.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_AES_128_NI_H -#define __Hacl_AES_128_NI_H +#ifndef __Hacl_AES_128_CTR32_NI_H +#define __Hacl_AES_128_CTR32_NI_H #if defined(__cplusplus) extern "C" { @@ -37,24 +37,29 @@ extern "C" { #include "libintvector.h" -typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_NI_aes_ctx; +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_CTR32_NI_aes_ctx; -typedef uint8_t *Hacl_AES_128_NI_skey; +typedef uint8_t *Hacl_AES_128_CTR32_NI_skey; void -Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce); +Hacl_AES_128_CTR32_NI_aes128_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +); -void 
Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); +void +Hacl_AES_128_CTR32_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); void -Hacl_AES_128_NI_aes128_key_block( +Hacl_AES_128_CTR32_NI_aes128_key_block( uint8_t *kb, Lib_IntVector_Intrinsics_vec128 *ctx, uint32_t counter ); void -Hacl_AES_128_NI_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -64,7 +69,7 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( ); void -Hacl_AES_128_NI_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -77,5 +82,5 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( } #endif -#define __Hacl_AES_128_NI_H_DEFINED +#define __Hacl_AES_128_CTR32_NI_H_DEFINED #endif diff --git a/include/msvc/Hacl_AES_128_GCM_CT64.h b/include/msvc/Hacl_AES_128_GCM_CT64.h new file mode 100644 index 00000000..edac7989 --- /dev/null +++ b/include/msvc/Hacl_AES_128_GCM_CT64.h @@ -0,0 +1,76 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_128_GCM_CT64_H +#define __Hacl_AES_128_GCM_CT64_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_CT64.h" +#include "Hacl_AES_128_CTR32_BitSlice.h" + +extern uint32_t Hacl_AES_128_GCM_CT64_aes_gcm_ctx_len; + +typedef uint64_t *Hacl_AES_128_GCM_CT64_aes_gcm_ctx; + +void Hacl_AES_128_GCM_CT64_aes128_gcm_init(uint64_t *ctx, uint8_t *key); + +void +Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_128_GCM_CT64_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_128_GCM_NI.h b/include/msvc/Hacl_AES_128_GCM_NI.h index ab520316..840637d2 100644 --- a/include/msvc/Hacl_AES_128_GCM_NI.h +++ b/include/msvc/Hacl_AES_128_GCM_NI.h @@ -36,7 +36,7 @@ extern "C" { #include "krml/internal/target.h" #include "Hacl_Gf128_NI.h" -#include "Hacl_AES_128_NI.h" +#include "Hacl_AES_128_CTR32_NI.h" #include "libintvector.h" typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_128_GCM_NI_aes_gcm_ctx; diff --git a/include/msvc/Hacl_AES_256_CTR32_BitSlice.h b/include/msvc/Hacl_AES_256_CTR32_BitSlice.h new file mode 100644 index 00000000..cb0be803 --- /dev/null +++ b/include/msvc/Hacl_AES_256_CTR32_BitSlice.h @@ -0,0 +1,83 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA,
CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_256_CTR32_BitSlice_H +#define __Hacl_AES_256_CTR32_BitSlice_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +typedef uint64_t *Hacl_AES_256_CTR32_BitSlice_aes_ctx; + +typedef uint8_t *Hacl_AES_256_CTR32_BitSlice_skey; + +void Hacl_AES_256_CTR32_BitSlice_aes256_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce); + +void Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(uint64_t *ctx, uint8_t *nonce); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t c +); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_CTR32_BitSlice_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_256_CTR32_NI.h b/include/msvc/Hacl_AES_256_CTR32_NI.h new file mode 100644 index 00000000..6b33b030 --- /dev/null +++ b/include/msvc/Hacl_AES_256_CTR32_NI.h @@ -0,0 +1,95 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The
above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_256_CTR32_NI_H +#define __Hacl_AES_256_CTR32_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_256_CTR32_NI_aes_ctx; + +typedef uint8_t *Hacl_AES_256_CTR32_NI_skey; + +void +Hacl_AES_256_CTR32_NI_aes256_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +); + +void +Hacl_AES_256_CTR32_NI_aes256_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce); + +void +Hacl_AES_256_CTR32_NI_aes256_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t c +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +void +Hacl_AES_256_CTR32_NI_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_CTR32_NI_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_256_GCM_CT64.h b/include/msvc/Hacl_AES_256_GCM_CT64.h new file
mode 100644 index 00000000..3505f4fc --- /dev/null +++ b/include/msvc/Hacl_AES_256_GCM_CT64.h @@ -0,0 +1,76 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef __Hacl_AES_256_GCM_CT64_H +#define __Hacl_AES_256_GCM_CT64_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_CT64.h" +#include "Hacl_AES_256_CTR32_BitSlice.h" + +extern uint32_t Hacl_AES_256_GCM_CT64_aes_gcm_ctx_len; + +typedef uint64_t *Hacl_AES_256_GCM_CT64_aes_gcm_ctx; + +void Hacl_AES_256_GCM_CT64_aes256_gcm_init(uint64_t *ctx, uint8_t *key); + +void +Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_GCM_CT64_H_DEFINED +#endif diff --git a/include/msvc/Hacl_AES_256_GCM_NI.h b/include/msvc/Hacl_AES_256_GCM_NI.h new file mode 100644 index 00000000..7408c4b8 --- /dev/null +++ b/include/msvc/Hacl_AES_256_GCM_NI.h @@ -0,0 +1,75 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef __Hacl_AES_256_GCM_NI_H +#define __Hacl_AES_256_GCM_NI_H + +#if defined(__cplusplus) +extern "C" { +#endif + +#include <string.h> +#include "krml/internal/types.h" +#include "krml/lowstar_endianness.h" +#include "krml/internal/target.h" + +#include "Hacl_Gf128_NI.h" +#include "Hacl_AES_256_CTR32_NI.h" +#include "libintvector.h" + +typedef Lib_IntVector_Intrinsics_vec128 *Hacl_AES_256_GCM_NI_aes_gcm_ctx; + +void Hacl_AES_256_GCM_NI_aes256_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key); + +void +Hacl_AES_256_GCM_NI_aes256_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +bool +Hacl_AES_256_GCM_NI_aes256_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +); + +#if defined(__cplusplus) +} +#endif + +#define __Hacl_AES_256_GCM_NI_H_DEFINED +#endif diff --git a/include/Hacl_Gf128_PreComp.h b/include/msvc/Hacl_Gf128_CT64.h similarity index 74% rename from include/Hacl_Gf128_PreComp.h rename to include/msvc/Hacl_Gf128_CT64.h index 3d67add1..d9eb236e 100644 --- a/include/Hacl_Gf128_PreComp.h +++ b/include/msvc/Hacl_Gf128_CT64.h @@ -23,8 +23,8 @@ */ -#ifndef __Hacl_Gf128_PreComp_H -#define __Hacl_Gf128_PreComp_H +#ifndef __Hacl_Gf128_CT64_H +#define __Hacl_Gf128_CT64_H #if defined(__cplusplus) extern "C" { @@ -35,20 +35,20
@@ extern "C" { #include "krml/lowstar_endianness.h" #include "krml/internal/target.h" -void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key); +void Hacl_Gf128_CT64_gcm_init(uint64_t *ctx, uint8_t *key); -void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text); +void Hacl_Gf128_CT64_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text); extern void -(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2); +(*Hacl_Gf128_CT64_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2); -void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx); +void Hacl_Gf128_CT64_gcm_emit(uint8_t *tag, uint64_t *ctx); -void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); +void Hacl_Gf128_CT64_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key); #if defined(__cplusplus) } #endif -#define __Hacl_Gf128_PreComp_H_DEFINED +#define __Hacl_Gf128_CT64_H_DEFINED #endif diff --git a/include/internal/Hacl_AES_128_BitSlice.h b/include/msvc/internal/Hacl_AES_128_CTR32_BitSlice.h similarity index 92% rename from include/internal/Hacl_AES_128_BitSlice.h rename to include/msvc/internal/Hacl_AES_128_CTR32_BitSlice.h index 3b95bb9b..1d5eee82 100644 --- a/include/internal/Hacl_AES_128_BitSlice.h +++ b/include/msvc/internal/Hacl_AES_128_CTR32_BitSlice.h @@ -23,8 +23,8 @@ */ -#ifndef __internal_Hacl_AES_128_BitSlice_H -#define __internal_Hacl_AES_128_BitSlice_H +#ifndef __internal_Hacl_AES_128_CTR32_BitSlice_H +#define __internal_Hacl_AES_128_CTR32_BitSlice_H #if defined(__cplusplus) extern "C" { @@ -36,7 +36,7 @@ extern "C" { #include "krml/internal/target.h" #include "internal/Hacl_Lib.h" -#include "../Hacl_AES_128_BitSlice.h" +#include "../Hacl_AES_128_CTR32_BitSlice.h" void Hacl_Impl_AES_CoreBitSlice_store_block0(uint8_t *out, uint64_t *inp); @@ -79,5 +79,5 @@ Hacl_Impl_AES_Generic_aes256_ctr_bitslice( } #endif -#define __internal_Hacl_AES_128_BitSlice_H_DEFINED 
+#define __internal_Hacl_AES_128_CTR32_BitSlice_H_DEFINED #endif diff --git a/include/msvc/internal/Hacl_Spec.h b/include/msvc/internal/Hacl_Spec.h index e3ddd18f..5bbbe8bf 100644 --- a/include/msvc/internal/Hacl_Spec.h +++ b/include/msvc/internal/Hacl_Spec.h @@ -42,8 +42,8 @@ extern "C" { #define Spec_Cipher_Expansion_Vale_AES256 2 #define Spec_Cipher_Expansion_AESNI_PCLMUL_AES128 3 #define Spec_Cipher_Expansion_AESNI_PCLMUL_AES256 4 -#define Spec_Cipher_Expansion_M32_AES128 5 -#define Spec_Cipher_Expansion_M32_AES256 6 +#define Spec_Cipher_Expansion_CT64_AES128 5 +#define Spec_Cipher_Expansion_CT64_AES256 6 typedef uint8_t Spec_Cipher_Expansion_impl; diff --git a/src/EverCrypt_AEAD.c b/src/EverCrypt_AEAD.c index 6c21c319..86630862 100644 --- a/src/EverCrypt_AEAD.c +++ b/src/EverCrypt_AEAD.c @@ -28,8 +28,10 @@ #include "internal/Vale.h" #ifdef HACL_CAN_COMPILE_AESNI_PCLMUL #include "Hacl_AES_128_GCM_NI.h" +#include "Hacl_AES_256_GCM_NI.h" #endif -#include "Hacl_AES_128_GCM_M32.h" +#include "Hacl_AES_128_GCM_CT64.h" +#include "Hacl_AES_256_GCM_CT64.h" #include "internal/Hacl_Spec.h" #include "config.h" #include "hacl-cpu-features.h" @@ -69,13 +71,13 @@ Spec_Agile_AEAD_alg EverCrypt_AEAD_alg_of_state(EverCrypt_AEAD_state_s *s) } case Spec_Cipher_Expansion_Vale_AES128: case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: - case Spec_Cipher_Expansion_M32_AES128: + case Spec_Cipher_Expansion_CT64_AES128: { return Spec_Agile_AEAD_AES128_GCM; } case Spec_Cipher_Expansion_Vale_AES256: case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: - case Spec_Cipher_Expansion_M32_AES256: + case Spec_Cipher_Expansion_CT64_AES256: { return Spec_Agile_AEAD_AES256_GCM; } @@ -122,10 +124,11 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) return EverCrypt_Error_Success; } else - #elif HACL_CAN_COMPILE_AESNI_PCLMUL + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL if (hacl_aesgcm_support() != 0) { - uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + uint8_t 
*ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)288U, sizeof (uint8_t)); Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx, k); EverCrypt_AEAD_state_s @@ -137,12 +140,12 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) else #endif { - uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)928U, sizeof (uint8_t)); uint64_t *aes_gcm_ctx = (uint64_t *)ek; - Hacl_AES_128_GCM_M32_aes128_gcm_init(aes_gcm_ctx, k); + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx, k); EverCrypt_AEAD_state_s *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); - p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_M32_AES128, .ek = ek }); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }); *dst = p; return EverCrypt_Error_Success; } @@ -151,12 +154,12 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) static EverCrypt_Error_error_code create_in_aes256_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) { + #if HACL_CAN_COMPILE_VALE bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)544U, sizeof (uint8_t)); @@ -170,8 +173,32 @@ create_in_aes256_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) *dst = p; return EverCrypt_Error_Success; } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Lib_IntVector_Intrinsics_vec128 
*aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx, k); + EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } + else + #endif + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)1184U, sizeof (uint8_t)); + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx, k); + EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } } /** @@ -379,7 +406,7 @@ encrypt_aes128_gcm_aesni_pclmul( } static EverCrypt_Error_error_code -encrypt_aes128_gcm_m32( +encrypt_aes128_gcm_ct64( EverCrypt_AEAD_state_s *s, uint8_t *iv, uint32_t iv_len, @@ -403,7 +430,7 @@ encrypt_aes128_gcm_m32( uint8_t *ek = scrut.ek; uint64_t *aes_gcm_ctx = (uint64_t *)ek; uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); - Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); memcpy(cipher, out, plain_len); memcpy(tag, out + plain_len, 16); KRML_HOST_FREE(out); @@ -534,6 +561,78 @@ encrypt_aes256_gcm( #endif } +static EverCrypt_Error_error_code +encrypt_aes256_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + 
uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +encrypt_aes256_gcm_ct64( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; +} + /** Encrypt and authenticate a message (`plain`) with associated data (`ad`). 
@@ -579,10 +678,26 @@ EverCrypt_AEAD_encrypt( { return encrypt_aes128_gcm(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } + case Spec_Cipher_Expansion_CT64_AES128: + { + return encrypt_aes128_gcm_ct64(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } case Spec_Cipher_Expansion_Vale_AES256: { return encrypt_aes256_gcm(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: + { + return encrypt_aes256_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } + case Spec_Cipher_Expansion_CT64_AES256: + { + return encrypt_aes256_gcm_ct64(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } case Spec_Cipher_Expansion_Hacl_CHACHA20: { if (iv_len != (uint32_t)12U) @@ -592,14 +707,6 @@ EverCrypt_AEAD_encrypt( EverCrypt_Chacha20Poly1305_aead_encrypt(ek, iv, ad_len, ad, plain_len, plain, cipher, tag); return EverCrypt_Error_Success; } - case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: - { - return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); - } - case Spec_Cipher_Expansion_M32_AES128: - { - return encrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); - } default: { KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__); @@ -609,12 +716,9 @@ EverCrypt_AEAD_encrypt( } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. 
You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_encrypt_expand_aes128_gcm_no_check( @@ -742,23 +846,68 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm_no_check( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + {
EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #endif + return EverCrypt_Error_Success; } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_encrypt_expand_aes256_gcm_no_check( @@ -886,14 +1035,62 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm_no_check( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len +
16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #endif + return EverCrypt_Error_Success; } EverCrypt_Error_error_code @@ -909,12 +1106,12 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm( uint8_t *tag ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[480U] = { 0U }; @@ -1029,10 +1226,70 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return 
EverCrypt_Error_Success; } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + else + #endif + { + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + return EverCrypt_Error_Success; } 
EverCrypt_Error_error_code @@ -1048,12 +1305,12 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm( uint8_t *tag ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[544U] = { 0U }; @@ -1168,10 +1425,70 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; } - #endif - return EverCrypt_Error_UnsupportedAlgorithm; + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + else + #endif + { + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = 
Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + return EverCrypt_Error_Success; } EverCrypt_Error_error_code @@ -1439,7 +1756,7 @@ decrypt_aes128_gcm_aesni_pclmul( } static EverCrypt_Error_error_code -decrypt_aes128_gcm_m32( +decrypt_aes128_gcm_ct64( EverCrypt_AEAD_state_s *s, uint8_t *iv, uint32_t iv_len, @@ -1465,7 +1782,7 @@ decrypt_aes128_gcm_m32( uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); memcpy(in, cipher, cipher_len); memcpy(in + cipher_len, tag, 16); - bool r = Hacl_AES_128_GCM_M32_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); KRML_HOST_FREE(in); if (r) { @@ -1608,6 +1925,86 @@ decrypt_aes256_gcm( #endif } +static EverCrypt_Error_error_code +decrypt_aes256_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + uint8_t* in = 
(uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +decrypt_aes256_gcm_ct64( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; +} + static EverCrypt_Error_error_code decrypt_chacha20_poly1305( EverCrypt_AEAD_state_s *s, @@ -1696,21 +2093,29 @@ EverCrypt_AEAD_decrypt( { return decrypt_aes128_gcm(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } + case Spec_Cipher_Expansion_CT64_AES128: + { + return decrypt_aes128_gcm_ct64(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } case Spec_Cipher_Expansion_Vale_AES256: { return decrypt_aes256_gcm(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - 
case Spec_Cipher_Expansion_Hacl_CHACHA20: + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: { - return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_aes256_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + case Spec_Cipher_Expansion_CT64_AES256: { - return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_aes256_gcm_ct64(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - case Spec_Cipher_Expansion_M32_AES128: + case Spec_Cipher_Expansion_Hacl_CHACHA20: { - return decrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } default: { @@ -1721,12 +2126,9 @@ EverCrypt_AEAD_decrypt( } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture.
*/ EverCrypt_Error_error_code EverCrypt_AEAD_decrypt_expand_aes128_gcm_no_check( @@ -1860,22 +2262,67 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm_no_check( return EverCrypt_Error_Success; } return EverCrypt_Error_AuthenticationFailure; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, 
cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #endif } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_decrypt_expand_aes256_gcm_no_check( @@ -2009,12 +2456,60 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm_no_check( return EverCrypt_Error_Success; } return EverCrypt_Error_AuthenticationFailure; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return
EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #endif } @@ -2031,12 +2526,12 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm( uint8_t *dst ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[480U] = { 0U }; @@ -2164,8 +2659,68 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm( } } } + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = 
Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + { + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } } EverCrypt_Error_error_code @@ -2181,12 +2736,12 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm( uint8_t *dst ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = 
EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[544U] = { 0U }; @@ -2314,8 +2869,68 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm( } } } + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + { + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + 
memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } } EverCrypt_Error_error_code diff --git a/src/Hacl_AES_128_BitSlice.c b/src/Hacl_AES_128_CTR32_BitSlice.c similarity index 98% rename from src/Hacl_AES_128_BitSlice.c rename to src/Hacl_AES_128_CTR32_BitSlice.c index a0d2938f..6af2a7c0 100644 --- a/src/Hacl_AES_128_BitSlice.c +++ b/src/Hacl_AES_128_CTR32_BitSlice.c @@ -23,7 +23,7 @@ */ -#include "internal/Hacl_AES_128_BitSlice.h" +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" #include "internal/Hacl_Lib.h" @@ -639,7 +639,7 @@ Hacl_Impl_AES_Generic_aes256_ctr_bitslice( } } -void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +void Hacl_AES_128_CTR32_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) { uint64_t *kex = ctx + (uint32_t)8U; uint64_t *n = ctx; @@ -778,13 +778,13 @@ void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *non Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); } -void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) +void Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) { uint64_t *n = ctx; Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); } -void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +void Hacl_AES_128_CTR32_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) { uint64_t *kex = ctx + (uint32_t)8U; uint64_t *n = ctx; @@ -806,7 +806,7 @@ void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t } inline void -Hacl_AES_128_BitSlice_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -955,7 +955,7 @@ Hacl_AES_128_BitSlice_aes128_ctr_encrypt( } inline void 
-Hacl_AES_128_BitSlice_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, diff --git a/src/Hacl_AES_128_NI.c b/src/Hacl_AES_128_CTR32_NI.c similarity index 96% rename from src/Hacl_AES_128_NI.c rename to src/Hacl_AES_128_CTR32_NI.c index 4a9d9ca8..21b2f898 100644 --- a/src/Hacl_AES_128_NI.c +++ b/src/Hacl_AES_128_CTR32_NI.c @@ -23,10 +23,14 @@ */ -#include "Hacl_AES_128_NI.h" +#include "Hacl_AES_128_CTR32_NI.h" void -Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce) +Hacl_AES_128_CTR32_NI_aes128_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +) { Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = ctx; @@ -278,7 +282,8 @@ Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); } -void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +void +Hacl_AES_128_CTR32_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) { Lib_IntVector_Intrinsics_vec128 *n = ctx; uint8_t nb[16U] = { 0U }; @@ -287,7 +292,7 @@ void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint } void -Hacl_AES_128_NI_aes128_key_block( +Hacl_AES_128_CTR32_NI_aes128_key_block( uint8_t *kb, Lib_IntVector_Intrinsics_vec128 *ctx, uint32_t counter @@ -296,14 +301,13 @@ Hacl_AES_128_NI_aes128_key_block( Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = ctx; KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; - uint32_t counter1 = counter; - uint32_t counter0 = htobe32(counter1); - uint32_t counter11 = htobe32(counter1 + (uint32_t)1U); - uint32_t counter2 = htobe32(counter1 + (uint32_t)2U); - uint32_t counter3 = htobe32(counter1 + (uint32_t)3U); + uint32_t counter0 = htobe32(counter); + 
uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); - st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter11, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); uint32_t klen = (uint32_t)1U; @@ -331,7 +335,7 @@ Hacl_AES_128_NI_aes128_key_block( } inline void -Hacl_AES_128_NI_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -598,11 +602,10 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -656,11 +659,10 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t 
counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -707,7 +709,7 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( } inline void -Hacl_AES_128_NI_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -974,11 +976,10 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -1032,11 +1033,10 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = 
htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); diff --git a/src/Hacl_AES_128_GCM_CT64.c b/src/Hacl_AES_128_GCM_CT64.c new file mode 100644 index 00000000..1051af30 --- /dev/null +++ b/src/Hacl_AES_128_GCM_CT64.c @@ -0,0 +1,210 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "Hacl_AES_128_GCM_CT64.h" + +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" + +uint32_t Hacl_AES_128_GCM_CT64_aes_gcm_ctx_len = (uint32_t)116U; + +void Hacl_AES_128_GCM_CT64_aes128_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint8_t gcm_key[16U] = { 0U }; + uint8_t nonce0[12U] = { 0U }; + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + Hacl_AES_128_CTR32_BitSlice_aes128_init(aes_ctx, key, nonce0); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_CT64_gcm_init(gcm_ctx, gcm_key); +} + +void +Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t tmp[16U] = { 0U }; + uint8_t *cip = out; + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + uint64_t *tag_mix = ctx + (uint32_t)114U; + uint32_t ctr; + uint8_t tag_mix10[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + uint64_t *aes_ctx1 = ctx; + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx1, iv); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); + uint64_t u = load64_le(tag_mix10); + ctx[114U] = u; + uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); + ctx[115U] = u0; + ctr = (uint32_t)2U; + } + else + { + uint64_t *aes_ctx1 = ctx; + uint64_t *gcm_ctx1 = ctx + (uint32_t)96U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); + store64_be(gcm_key, gcm_ctx1[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + 
Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u0; + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + uint64_t u = load64_le(tag_mix1); + ctx[114U] = u; + uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); + ctx[115U] = u1; + ctr = ctr0 + (uint32_t)1U; + } + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr); + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, cip); + store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_CT64_gcm_emit(tmp, gcm_ctx); + uint64_t u0 = load64_le(tmp); + uint64_t tmp0 = u0; + uint64_t u = load64_le(tmp + (uint32_t)8U); + uint64_t tmp1 = u; + uint64_t tmp01 = tmp0 ^ tag_mix[0U]; + uint64_t tmp11 = tmp1 ^ tag_mix[1U]; + store64_le(out + len, tmp01); + store64_le(out + len + (uint32_t)8U, tmp11); +} + +bool +Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t scratch[18U] = { 0U }; + uint8_t *text = scratch; + uint8_t *result = scratch + (uint32_t)17U; + uint8_t *ciphertext = cipher; + uint8_t *tag = cipher + len; + uint32_t ctr; + uint8_t tag_mix0[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + uint64_t *aes_ctx = ctx; + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + uint64_t u = load64_le(tag_mix0); + ctx[114U] = u; + uint64_t u0 = load64_le(tag_mix0 + (uint32_t)8U); + 
ctx[115U] = u0; + ctr = (uint32_t)2U; + } + else + { + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx[8U]); + store64_be(gcm_key, gcm_ctx[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx, tag_iv); + uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u0; + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix1, aes_ctx, ctr0); + uint64_t u = load64_le(tag_mix1); + ctx[114U] = u; + uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); + ctx[115U] = u1; + ctr = ctr0 + (uint32_t)1U; + } + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + uint64_t *tag_mix = ctx + (uint32_t)114U; + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); + store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_CT64_gcm_emit(text, gcm_ctx); + uint64_t u0 = load64_le(text); + uint64_t text0 = u0; + uint64_t u = load64_le(text + (uint32_t)8U); + uint64_t text1 = u; + uint64_t text01 = text0 ^ tag_mix[0U]; + uint64_t text11 = text1 ^ tag_mix[1U]; + store64_le(text, text01); + store64_le(text + (uint32_t)8U, text11); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + result[0U] = result[0U] | (text[i] ^ tag[i]);); + uint8_t res8 = result[0U]; + if (res8 == (uint8_t)0U) + { + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr); + return true; + } + return 
false; +} + diff --git a/src/Hacl_AES_128_GCM_NI.c b/src/Hacl_AES_128_GCM_NI.c index 16e03251..1884764e 100644 --- a/src/Hacl_AES_128_GCM_NI.c +++ b/src/Hacl_AES_128_GCM_NI.c @@ -30,9 +30,9 @@ void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, u uint8_t gcm_key[16U] = { 0U }; uint8_t nonce0[12U] = { 0U }; Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Hacl_AES_128_NI_aes128_init(aes_ctx, key, nonce0); - Hacl_AES_128_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Hacl_AES_128_CTR32_NI_aes128_init(aes_ctx, key, nonce0); + Hacl_AES_128_CTR32_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key); } @@ -57,15 +57,15 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( if (iv_len == (uint32_t)12U) { Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, iv); - Hacl_AES_128_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); ctr = (uint32_t)2U; } else { Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); @@ -75,11 +75,11 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, tag_iv); + 
Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, tag_iv); uint32_t u = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u; - Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); ctr = ctr0 + (uint32_t)1U; } uint8_t *cip = out; @@ -93,11 +93,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -151,11 +150,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + 
(uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -199,8 +197,9 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); memcpy(ob, last, rem * sizeof (uint8_t)); } - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[17U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip); uint8_t tmp[16U] = { 0U }; @@ -212,7 +211,6 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( Lib_IntVector_Intrinsics_vec128 tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix); Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1); - gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; } bool @@ -232,28 +230,25 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( uint8_t *result = scratch + (uint32_t)17U; uint8_t *ciphertext = cipher; uint8_t *tag = cipher + len; - Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; uint32_t ctr; - uint8_t tag_mix10[16U] = { 0U }; + uint8_t tag_mix0[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; uint8_t tag_iv[16U] = { 0U }; uint8_t size_iv[16U] = { 0U }; uint8_t tag_mix1[16U] = { 0U }; if (iv_len == (uint32_t)12U) { - Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_NI_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix10); + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + 
Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); ctr = (uint32_t)2U; } else { - Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx1 = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx1[4U]); + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, @@ -262,13 +257,17 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, tag_iv); uint32_t u = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u; - Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx1, ctr0); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); ctr = ctr0 + (uint32_t)1U; } + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[17U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext); store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); @@ -296,11 +295,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; 
Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -354,11 +352,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); diff --git a/src/Hacl_AES_256_CTR32_BitSlice.c b/src/Hacl_AES_256_CTR32_BitSlice.c new file mode 100644 index 00000000..461e3153 --- /dev/null +++ b/src/Hacl_AES_256_CTR32_BitSlice.c @@ -0,0 +1,634 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a 
copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "Hacl_AES_256_CTR32_BitSlice.h" + +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" + +void Hacl_AES_256_CTR32_BitSlice_aes256_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, key); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, key + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next01[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next01[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next11[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next11[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next02[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next02[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + 
Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next12[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next12[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next03[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next03[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next13[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next13[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next04[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next04[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + 
(uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next14[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next14[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next05[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next05[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next15[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next15[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next06[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next06[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next16[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + 
uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next16[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * (uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next07[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next07[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(uint64_t *ctx, uint8_t *nonce) +{ + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_256_CTR32_BitSlice_aes256_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint64_t st[8U] = { 0U }; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, counter); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + Hacl_Impl_AES_CoreBitSlice_store_block0(kb, st); +} + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t c +) +{ + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[128U] = { 0U }; + uint64_t *kex = ctx 
+ (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, k); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, k + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next01[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next01[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next11[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next11[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next02[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next02[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next12[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + 
uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next12[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next03[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next03[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next13[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next13[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next04[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next04[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next14[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next14[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, 
prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next05[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next05[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next15[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next15[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next06[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next06[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next16[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next16[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * 
(uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next07[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next07[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[128U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, k); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, k + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next01[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next01[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next11[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next11[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = 
kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next02[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next02[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next12[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next12[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next03[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next03[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next13[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next13[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + 
Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next04[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next04[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next14[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next14[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next05[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next05[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next15[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next15[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + 
(uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next06[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next06[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next16[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next16[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * (uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next07[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next07[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + diff --git a/src/Hacl_AES_256_CTR32_NI.c b/src/Hacl_AES_256_CTR32_NI.c new file mode 100644 index 00000000..81f94996 --- /dev/null +++ b/src/Hacl_AES_256_CTR32_NI.c @@ -0,0 +1,1433 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons 
to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_256_CTR32_NI.h" + +void +Hacl_AES_256_CTR32_NI_aes256_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex; + Lib_IntVector_Intrinsics_vec128 *next1 = kex + klen; + next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; + Lib_IntVector_Intrinsics_vec128 *prev1 = next1; + Lib_IntVector_Intrinsics_vec128 *next01 = kex + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next11 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key1 = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 
= + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key4 = + Lib_IntVector_Intrinsics_vec128_xor(key3, + Lib_IntVector_Intrinsics_vec128_shift_left(key3, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key4); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v1, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key10 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key40 = + Lib_IntVector_Intrinsics_vec128_xor(key30, + Lib_IntVector_Intrinsics_vec128_shift_left(key30, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key40); + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128 *next02 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next12 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v2, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key11 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key41 = + Lib_IntVector_Intrinsics_vec128_xor(key31, + Lib_IntVector_Intrinsics_vec128_shift_left(key31, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key41); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v3, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key12 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key42 = + Lib_IntVector_Intrinsics_vec128_xor(key32, + Lib_IntVector_Intrinsics_vec128_shift_left(key32, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key42); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next13 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U); + next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key13 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + 
Lib_IntVector_Intrinsics_vec128 + key43 = + Lib_IntVector_Intrinsics_vec128_xor(key33, + Lib_IntVector_Intrinsics_vec128_shift_left(key33, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key43); + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key14 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key44 = + Lib_IntVector_Intrinsics_vec128_xor(key34, + Lib_IntVector_Intrinsics_vec128_shift_left(key34, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key44); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); + next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key15 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key45 = + 
Lib_IntVector_Intrinsics_vec128_xor(key35, + Lib_IntVector_Intrinsics_vec128_shift_left(key35, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key45); + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key16 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key46 = + Lib_IntVector_Intrinsics_vec128_xor(key36, + Lib_IntVector_Intrinsics_vec128_shift_left(key36, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key46); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U], (uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key47 = + Lib_IntVector_Intrinsics_vec128_xor(key37, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key37, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key47); + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key18 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key48 = + Lib_IntVector_Intrinsics_vec128_xor(key38, + Lib_IntVector_Intrinsics_vec128_shift_left(key38, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key48); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev05[0U]; + Lib_IntVector_Intrinsics_vec128 + key29 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key49 = + Lib_IntVector_Intrinsics_vec128_xor(key39, + Lib_IntVector_Intrinsics_vec128_shift_left(key39, (uint32_t)32U)); 
+ next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key49); + Lib_IntVector_Intrinsics_vec128 + v11 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v11, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key110 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key410 = + Lib_IntVector_Intrinsics_vec128_xor(key310, + Lib_IntVector_Intrinsics_vec128_shift_left(key310, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key410); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key111 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key411 = + Lib_IntVector_Intrinsics_vec128_xor(key311, + Lib_IntVector_Intrinsics_vec128_shift_left(key311, (uint32_t)32U)); + next07[0U] = Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key411); + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, 
(uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_256_CTR32_NI_aes256_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_256_CTR32_NI_aes256_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = 
Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128_store128_le(kb, st[0U]); +} + +void +Hacl_AES_256_CTR32_NI_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t c +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], 
k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * 
(uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], 
kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + +inline void +Hacl_AES_256_CTR32_NI_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[16U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex0; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + klen; + next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; + Lib_IntVector_Intrinsics_vec128 *prev1 = next1; + Lib_IntVector_Intrinsics_vec128 *next01 = kex0 + klen * (uint32_t)2U; + 
Lib_IntVector_Intrinsics_vec128 *next11 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key3); + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128 *next02 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next12 = kex0 + klen * (uint32_t)5U; + 
Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key31); + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key5 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next13 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v7 = 
Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U); + next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key33); + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key7 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); 
+ next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key35); + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key9 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex0 + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U], (uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + 
(uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key37); + Lib_IntVector_Intrinsics_vec128 + v13 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v13, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key19 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key38); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex0 + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex0 + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v14 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v14, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + 
(uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key29 = prev05[0U]; + Lib_IntVector_Intrinsics_vec128 + key111 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key39); + Lib_IntVector_Intrinsics_vec128 + v15 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v15, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key40 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key112 = + Lib_IntVector_Intrinsics_vec128_xor(key40, + Lib_IntVector_Intrinsics_vec128_shift_left(key40, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key112, + Lib_IntVector_Intrinsics_vec128_shift_left(key112, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key310); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex0 + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key41 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key113 = + 
Lib_IntVector_Intrinsics_vec128_xor(key41, + Lib_IntVector_Intrinsics_vec128_shift_left(key41, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key212 = + Lib_IntVector_Intrinsics_vec128_xor(key113, + Lib_IntVector_Intrinsics_vec128_shift_left(key113, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key212, + Lib_IntVector_Intrinsics_vec128_shift_left(key212, (uint32_t)32U)); + next07[0U] = Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key311); + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = 
Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + 
uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + 
st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} +/* Hacl_AES_256_CTR32_NI_aes256_ctr_decrypt: AES-256 in counter mode built on the AES-NI intrinsics. Expands the 32-byte key k into round keys, combines the 12-byte nonce n with a 32-bit block counter that starts at c, and XORs the resulting keystream with len bytes of inp, writing the result to out. Only the AES encryption round intrinsics are used here: the data is never passed through the cipher itself, it is only XORed with keystream. NOTE(review): the key schedule is rebuilt on every call and ctx, nb and last are not wiped before returning; confirm that is acceptable for callers. */ +inline void +Hacl_AES_256_CTR32_NI_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ +/* Local context: ctx[0] holds the nonce block (n10), ctx[1] through ctx[15] hold the 15 round keys (kex0). */ KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[16U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex0; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + klen; +/* The two 16-byte halves of k are used directly as round keys 0 and 1. */ next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; + Lib_IntVector_Intrinsics_vec128 *prev1 = next1; + Lib_IntVector_Intrinsics_vec128 *next01 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128
*next11 = kex0 + klen * (uint32_t)3U; +/* Unrolled key expansion: every step derives two further round keys from the previous pair. Even-indexed keys use keygen_assist with the constants 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40 and shuffle32 index 3; odd-indexed keys use constant 0 and shuffle32 index 2. The three shift_left-by-32/xor steps fold the older key of the pair into the result. */ Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key3); + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128 *next02 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next12 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v5 =
Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key31); + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key5 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next13 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U);
+ next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key33); + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key7 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); + next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, +
(uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key35); + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key9 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex0 + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U], (uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); +
Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key37); + Lib_IntVector_Intrinsics_vec128 + v13 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v13, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key19 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key38); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex0 + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex0 + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v14 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v14, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key29 = prev05[0U]; +
Lib_IntVector_Intrinsics_vec128 + key111 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key39); + Lib_IntVector_Intrinsics_vec128 + v15 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v15, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key40 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key112 = + Lib_IntVector_Intrinsics_vec128_xor(key40, + Lib_IntVector_Intrinsics_vec128_shift_left(key40, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key112, + Lib_IntVector_Intrinsics_vec128_shift_left(key112, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key310); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex0 + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key41 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key113 = + Lib_IntVector_Intrinsics_vec128_xor(key41, +
Lib_IntVector_Intrinsics_vec128_shift_left(key41, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key212 = + Lib_IntVector_Intrinsics_vec128_xor(key113, + Lib_IntVector_Intrinsics_vec128_shift_left(key113, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key212, + Lib_IntVector_Intrinsics_vec128_shift_left(key212, (uint32_t)32U)); + next07[0U] = Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key311); +/* Zero-pad the 12-byte nonce to a 16-byte block and store it in ctx[0]. */ uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +/* Bulk path: each iteration handles 64 bytes as four 16-byte counter blocks. */ uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; +/* Each block counter is converted with htobe32 and inserted at 32-bit index 3 of the nonce block. */ uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; +/* Keystream: XOR with round key 0, 13 aes_enc rounds with round keys 1 to 13, then aes_enc_last with round key 14. */ st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] =
Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } +/* Tail: the remaining rem bytes (fewer than 64) are copied into a zeroed 64-byte scratch buffer; a full four-block keystream is generated and XORed in, but only rem bytes are copied back to out. */ uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out +
blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); +
Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + diff --git a/src/Hacl_AES_128_GCM_M32.c b/src/Hacl_AES_256_GCM_CT64.c similarity index 64% rename from src/Hacl_AES_128_GCM_M32.c rename to src/Hacl_AES_256_GCM_CT64.c index bd172a0e..436f2318 100644 --- a/src/Hacl_AES_128_GCM_M32.c +++ b/src/Hacl_AES_256_GCM_CT64.c @@ -23,25 +23,23 @@ */ -#include "Hacl_AES_128_GCM_M32.h" +#include "Hacl_AES_256_GCM_CT64.h" -#include "internal/Hacl_AES_128_BitSlice.h" +uint32_t Hacl_AES_256_GCM_CT64_aes_gcm_ctx_len = (uint32_t)148U; -uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len = (uint32_t)396U; - -void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key) +void Hacl_AES_256_GCM_CT64_aes256_gcm_init(uint64_t *ctx, uint8_t *key) { uint8_t gcm_key[16U] = { 0U }; uint8_t nonce0[12U] = { 0U }; uint64_t *aes_ctx = ctx; uint64_t *gcm_ctx = ctx + (uint32_t)128U; - Hacl_AES_128_BitSlice_aes128_init(aes_ctx, key, nonce0); -
Hacl_AES_128_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); - Hacl_Gf128_PreComp_gcm_init(gcm_ctx, gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_init(aes_ctx, key, nonce0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_CT64_gcm_init(gcm_ctx, gcm_key); } void -Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -56,7 +54,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint8_t *cip = out; uint64_t *aes_ctx = ctx; uint64_t *gcm_ctx = ctx + (uint32_t)128U; - uint64_t *tag_mix = ctx + (uint32_t)394U; + uint64_t *tag_mix = ctx + (uint32_t)146U; uint32_t ctr; uint8_t tag_mix10[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; @@ -66,12 +64,12 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( if (iv_len == (uint32_t)12U) { uint64_t *aes_ctx1 = ctx; - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx1, iv); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); uint64_t u = load64_le(tag_mix10); - ctx[394U] = u; + ctx[146U] = u; uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); - ctx[395U] = u0; + ctx[147U] = u0; ctr = (uint32_t)2U; } else @@ -80,31 +78,33 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint64_t *gcm_ctx1 = ctx + (uint32_t)128U; store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); store64_be(gcm_key, gcm_ctx1[9U]); - Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, (uint32_t)0U, (uint32_t)16U, (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); - Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, 
gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx1, tag_iv); uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u0; - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix1, aes_ctx1, ctr0); uint64_t u = load64_le(tag_mix1); - ctx[394U] = u; + ctx[146U] = u; uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); - ctx[395U] = u1; + ctx[147U] = u1; ctr = ctr0 + (uint32_t)1U; } - Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, cip); + Hacl_AES_256_CTR32_BitSlice_aes256_ctr(len, cip, text, aes_ctx, ctr); + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, cip); store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); - Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); - Hacl_Gf128_PreComp_gcm_emit(tmp, gcm_ctx); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_CT64_gcm_emit(tmp, gcm_ctx); uint64_t u0 = load64_le(tmp); uint64_t tmp0 = u0; uint64_t u = load64_le(tmp + (uint32_t)8U); @@ -113,12 +113,10 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint64_t tmp11 = tmp1 ^ tag_mix[1U]; store64_le(out + len, tmp01); store64_le(out + len + (uint32_t)8U, tmp11); - gcm_ctx[0U] = (uint64_t)0U; - gcm_ctx[1U] = (uint64_t)0U; } bool -Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -134,56 +132,58 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( uint8_t *result = scratch + (uint32_t)17U; uint8_t *ciphertext = cipher; uint8_t *tag = cipher + len; - uint64_t *aes_ctx = ctx; - uint64_t *gcm_ctx = ctx + (uint32_t)128U; - uint64_t 
*tag_mix = ctx + (uint32_t)394U; uint32_t ctr; - uint8_t tag_mix10[16U] = { 0U }; + uint8_t tag_mix0[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; uint8_t tag_iv[16U] = { 0U }; uint8_t size_iv[16U] = { 0U }; uint8_t tag_mix1[16U] = { 0U }; if (iv_len == (uint32_t)12U) { - uint64_t *aes_ctx1 = ctx; - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); - uint64_t u = load64_le(tag_mix10); - ctx[394U] = u; - uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); - ctx[395U] = u0; + uint64_t *aes_ctx = ctx; + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx, iv); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + uint64_t u = load64_le(tag_mix0); + ctx[146U] = u; + uint64_t u0 = load64_le(tag_mix0 + (uint32_t)8U); + ctx[147U] = u0; ctr = (uint32_t)2U; } else { - uint64_t *aes_ctx1 = ctx; - uint64_t *gcm_ctx1 = ctx + (uint32_t)128U; - store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); - store64_be(gcm_key, gcm_ctx1[9U]); - Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key); + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)128U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx[8U]); + store64_be(gcm_key, gcm_ctx[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, (uint32_t)0U, (uint32_t)16U, (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); - Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx, tag_iv); uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u0; - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix1, aes_ctx, ctr0); uint64_t u = load64_le(tag_mix1); - ctx[394U] = 
u; + ctx[146U] = u; uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); - ctx[395U] = u1; + ctx[147U] = u1; ctr = ctr0 + (uint32_t)1U; } - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)128U; + uint64_t *tag_mix = ctx + (uint32_t)146U; + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); - Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); - Hacl_Gf128_PreComp_gcm_emit(text, gcm_ctx); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_CT64_gcm_emit(text, gcm_ctx); uint64_t u0 = load64_le(text); uint64_t text0 = u0; uint64_t u = load64_le(text + (uint32_t)8U); @@ -200,7 +200,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( uint8_t res8 = result[0U]; if (res8 == (uint8_t)0U) { - Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr); + Hacl_AES_256_CTR32_BitSlice_aes256_ctr(len, out, ciphertext, aes_ctx, ctr); return true; } return false; diff --git a/src/Hacl_AES_256_GCM_NI.c b/src/Hacl_AES_256_GCM_NI.c new file mode 100644 index 00000000..7c415057 --- /dev/null +++ b/src/Hacl_AES_256_GCM_NI.c @@ -0,0 +1,182 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, 
and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_256_GCM_NI.h" + +void Hacl_AES_256_GCM_NI_aes256_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key) +{ + uint8_t gcm_key[16U] = { 0U }; + uint8_t nonce0[12U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Hacl_AES_256_CTR32_NI_aes256_init(aes_ctx, key, nonce0); + Hacl_AES_256_CTR32_NI_aes256_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key); +} + +void +Hacl_AES_256_GCM_NI_aes256_gcm_encrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint32_t ctr; + uint8_t tag_mix0[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, iv); + Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); + ctr = (uint32_t)2U; + } + else + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx 
= ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); + Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, tag_iv); + uint32_t u = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u; + Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix1, aes_ctx, ctr0); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + ctr = ctr0 + (uint32_t)1U; + } + uint8_t *cip = out; + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Hacl_AES_256_CTR32_NI_aes256_ctr(len, cip, text, aes_ctx, ctr); + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip); + uint8_t tmp[16U] = { 0U }; + store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_NI_gcm_emit(tmp, gcm_ctx); + Lib_IntVector_Intrinsics_vec128 tmp_vec = Lib_IntVector_Intrinsics_vec128_load128_le(tmp); + Lib_IntVector_Intrinsics_vec128 + tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix); + Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1); +} + +bool +Hacl_AES_256_GCM_NI_aes256_gcm_decrypt( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t scratch[18U] = { 0U }; + uint8_t *text = scratch; + uint8_t *result = scratch + 
(uint32_t)17U; + uint8_t *ciphertext = cipher; + uint8_t *tag = cipher + len; + uint32_t ctr; + uint8_t tag_mix0[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, iv); + Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); + ctr = (uint32_t)2U; + } + else + { + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); + Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, tag_iv); + uint32_t u = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u; + Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix1, aes_ctx, ctr0); + ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + ctr = ctr0 + (uint32_t)1U; + } + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext); + store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_NI_gcm_emit(text, gcm_ctx); + Lib_IntVector_Intrinsics_vec128 text_vec = 
Lib_IntVector_Intrinsics_vec128_load128_le(text); + Lib_IntVector_Intrinsics_vec128 + text_vec1 = Lib_IntVector_Intrinsics_vec128_xor(text_vec, tag_mix); + Lib_IntVector_Intrinsics_vec128_store128_le(text, text_vec1); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + result[0U] = result[0U] | (text[i] ^ tag[i]);); + uint8_t res8 = result[0U]; + if (res8 == (uint8_t)0U) + { + Hacl_AES_256_CTR32_NI_aes256_ctr(len, out, ciphertext, aes_ctx, ctr); + return true; + } + return false; +} + diff --git a/src/Hacl_Gf128_CT64.c b/src/Hacl_Gf128_CT64.c new file mode 100644 index 00000000..1d0f8bc6 --- /dev/null +++ b/src/Hacl_Gf128_CT64.c @@ -0,0 +1,1801 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "Hacl_Gf128_CT64.h" + +static inline void fmul0(uint64_t *x, uint64_t *y) +{ + uint64_t uu____0 = y[0U]; + uint64_t + x10 = + (uu____0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x20 = + (x10 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x10 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x30 = + (x20 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x20 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x4 = + (x30 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x30 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x5 = + (x4 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x4 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t yr1 = x5 << (uint32_t)32U | x5 >> (uint32_t)32U; + uint64_t uu____1 = y[1U]; + uint64_t + x11 = + (uu____1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x21 = + (x11 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x11 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x31 = + (x21 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x21 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x40 = + (x31 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x31 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x50 = + (x40 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x40 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t yr2 = x50 << (uint32_t)32U | x50 >> (uint32_t)32U; + uint64_t uu____2 = x[0U]; + uint64_t uu____3 = x[1U]; + uint64_t uu____4 = y[0U]; + uint64_t uu____5 = y[1U]; + uint64_t uu____6 = y[0U] ^ y[1U]; + uint64_t + x12 = + 
(uu____2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x22 = + (x12 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x12 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x32 = + (x22 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x22 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x41 = + (x32 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x32 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x51 = + (x41 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x41 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r = x51 << (uint32_t)32U | x51 >> (uint32_t)32U; + uint64_t + x13 = + (uu____3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x23 = + (x13 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x13 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x33 = + (x23 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x23 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x42 = + (x33 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x33 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x52 = + (x42 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x42 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r = x52 << (uint32_t)32U | x52 >> (uint32_t)32U; + uint64_t y3 = uu____2 ^ uu____3; + uint64_t y3r = y1r ^ y2r; + uint64_t x00 = uu____2 & (uint64_t)0x1111111111111111U; + uint64_t x14 = uu____2 & (uint64_t)0x2222222222222222U; + uint64_t x24 = uu____2 & (uint64_t)0x4444444444444444U; + uint64_t x34 = uu____2 & (uint64_t)0x8888888888888888U; + uint64_t y00 = uu____4 & 
(uint64_t)0x1111111111111111U; + uint64_t y10 = uu____4 & (uint64_t)0x2222222222222222U; + uint64_t y20 = uu____4 & (uint64_t)0x4444444444444444U; + uint64_t y310 = uu____4 & (uint64_t)0x8888888888888888U; + uint64_t z0 = x00 * y00 ^ (x14 * y310 ^ (x24 * y20 ^ x34 * y10)); + uint64_t z10 = x00 * y10 ^ (x14 * y00 ^ (x24 * y310 ^ x34 * y20)); + uint64_t z20 = x00 * y20 ^ (x14 * y10 ^ (x24 * y00 ^ x34 * y310)); + uint64_t z30 = x00 * y310 ^ (x14 * y20 ^ (x24 * y10 ^ x34 * y00)); + uint64_t + z00 = + (((z0 & (uint64_t)0x1111111111111111U) | (z10 & (uint64_t)0x2222222222222222U)) + | (z20 & (uint64_t)0x4444444444444444U)) + | (z30 & (uint64_t)0x8888888888888888U); + uint64_t x01 = uu____3 & (uint64_t)0x1111111111111111U; + uint64_t x15 = uu____3 & (uint64_t)0x2222222222222222U; + uint64_t x25 = uu____3 & (uint64_t)0x4444444444444444U; + uint64_t x35 = uu____3 & (uint64_t)0x8888888888888888U; + uint64_t y01 = uu____5 & (uint64_t)0x1111111111111111U; + uint64_t y11 = uu____5 & (uint64_t)0x2222222222222222U; + uint64_t y21 = uu____5 & (uint64_t)0x4444444444444444U; + uint64_t y311 = uu____5 & (uint64_t)0x8888888888888888U; + uint64_t z010 = x01 * y01 ^ (x15 * y311 ^ (x25 * y21 ^ x35 * y11)); + uint64_t z12 = x01 * y11 ^ (x15 * y01 ^ (x25 * y311 ^ x35 * y21)); + uint64_t z22 = x01 * y21 ^ (x15 * y11 ^ (x25 * y01 ^ x35 * y311)); + uint64_t z31 = x01 * y311 ^ (x15 * y21 ^ (x25 * y11 ^ x35 * y01)); + uint64_t + z13 = + (((z010 & (uint64_t)0x1111111111111111U) | (z12 & (uint64_t)0x2222222222222222U)) + | (z22 & (uint64_t)0x4444444444444444U)) + | (z31 & (uint64_t)0x8888888888888888U); + uint64_t x02 = y3 & (uint64_t)0x1111111111111111U; + uint64_t x16 = y3 & (uint64_t)0x2222222222222222U; + uint64_t x26 = y3 & (uint64_t)0x4444444444444444U; + uint64_t x36 = y3 & (uint64_t)0x8888888888888888U; + uint64_t y02 = uu____6 & (uint64_t)0x1111111111111111U; + uint64_t y12 = uu____6 & (uint64_t)0x2222222222222222U; + uint64_t y22 = uu____6 & (uint64_t)0x4444444444444444U; + uint64_t 
y312 = uu____6 & (uint64_t)0x8888888888888888U; + uint64_t z011 = x02 * y02 ^ (x16 * y312 ^ (x26 * y22 ^ x36 * y12)); + uint64_t z110 = x02 * y12 ^ (x16 * y02 ^ (x26 * y312 ^ x36 * y22)); + uint64_t z23 = x02 * y22 ^ (x16 * y12 ^ (x26 * y02 ^ x36 * y312)); + uint64_t z32 = x02 * y312 ^ (x16 * y22 ^ (x26 * y12 ^ x36 * y02)); + uint64_t + z24 = + (((z011 & (uint64_t)0x1111111111111111U) | (z110 & (uint64_t)0x2222222222222222U)) + | (z23 & (uint64_t)0x4444444444444444U)) + | (z32 & (uint64_t)0x8888888888888888U); + uint64_t x03 = y1r & (uint64_t)0x1111111111111111U; + uint64_t x17 = y1r & (uint64_t)0x2222222222222222U; + uint64_t x27 = y1r & (uint64_t)0x4444444444444444U; + uint64_t x37 = y1r & (uint64_t)0x8888888888888888U; + uint64_t y03 = yr1 & (uint64_t)0x1111111111111111U; + uint64_t y13 = yr1 & (uint64_t)0x2222222222222222U; + uint64_t y23 = yr1 & (uint64_t)0x4444444444444444U; + uint64_t y313 = yr1 & (uint64_t)0x8888888888888888U; + uint64_t z012 = x03 * y03 ^ (x17 * y313 ^ (x27 * y23 ^ x37 * y13)); + uint64_t z111 = x03 * y13 ^ (x17 * y03 ^ (x27 * y313 ^ x37 * y23)); + uint64_t z210 = x03 * y23 ^ (x17 * y13 ^ (x27 * y03 ^ x37 * y313)); + uint64_t z33 = x03 * y313 ^ (x17 * y23 ^ (x27 * y13 ^ x37 * y03)); + uint64_t + z0h = + (((z012 & (uint64_t)0x1111111111111111U) | (z111 & (uint64_t)0x2222222222222222U)) + | (z210 & (uint64_t)0x4444444444444444U)) + | (z33 & (uint64_t)0x8888888888888888U); + uint64_t x04 = y2r & (uint64_t)0x1111111111111111U; + uint64_t x18 = y2r & (uint64_t)0x2222222222222222U; + uint64_t x28 = y2r & (uint64_t)0x4444444444444444U; + uint64_t x38 = y2r & (uint64_t)0x8888888888888888U; + uint64_t y04 = yr2 & (uint64_t)0x1111111111111111U; + uint64_t y14 = yr2 & (uint64_t)0x2222222222222222U; + uint64_t y24 = yr2 & (uint64_t)0x4444444444444444U; + uint64_t y314 = yr2 & (uint64_t)0x8888888888888888U; + uint64_t z013 = x04 * y04 ^ (x18 * y314 ^ (x28 * y24 ^ x38 * y14)); + uint64_t z112 = x04 * y14 ^ (x18 * y04 ^ (x28 * y314 ^ x38 * y24)); + 
uint64_t z211 = x04 * y24 ^ (x18 * y14 ^ (x28 * y04 ^ x38 * y314)); + uint64_t z34 = x04 * y314 ^ (x18 * y24 ^ (x28 * y14 ^ x38 * y04)); + uint64_t + z1h = + (((z013 & (uint64_t)0x1111111111111111U) | (z112 & (uint64_t)0x2222222222222222U)) + | (z211 & (uint64_t)0x4444444444444444U)) + | (z34 & (uint64_t)0x8888888888888888U); + uint64_t x0 = y3r & (uint64_t)0x1111111111111111U; + uint64_t x19 = y3r & (uint64_t)0x2222222222222222U; + uint64_t x29 = y3r & (uint64_t)0x4444444444444444U; + uint64_t x3 = y3r & (uint64_t)0x8888888888888888U; + uint64_t y0 = (yr1 ^ yr2) & (uint64_t)0x1111111111111111U; + uint64_t y1 = (yr1 ^ yr2) & (uint64_t)0x2222222222222222U; + uint64_t y2 = (yr1 ^ yr2) & (uint64_t)0x4444444444444444U; + uint64_t y31 = (yr1 ^ yr2) & (uint64_t)0x8888888888888888U; + uint64_t z01 = x0 * y0 ^ (x19 * y31 ^ (x29 * y2 ^ x3 * y1)); + uint64_t z11 = x0 * y1 ^ (x19 * y0 ^ (x29 * y31 ^ x3 * y2)); + uint64_t z212 = x0 * y2 ^ (x19 * y1 ^ (x29 * y0 ^ x3 * y31)); + uint64_t z35 = x0 * y31 ^ (x19 * y2 ^ (x29 * y1 ^ x3 * y0)); + uint64_t + z2h = + (((z01 & (uint64_t)0x1111111111111111U) | (z11 & (uint64_t)0x2222222222222222U)) + | (z212 & (uint64_t)0x4444444444444444U)) + | (z35 & (uint64_t)0x8888888888888888U); + uint64_t z21 = z24 ^ (z00 ^ z13); + uint64_t z2h1 = z2h ^ (z0h ^ z1h); + uint64_t + x110 = + (z0h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x210 = + (x110 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x110 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x39 = + (x210 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x210 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x43 = + (x39 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x39 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x53 = + (x43 & (uint64_t)0x0000FFFF0000FFFFU) + << 
(uint32_t)(uint8_t)16U + | (x43 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h1 = (x53 << (uint32_t)32U | x53 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x111 = + (z1h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x211 = + (x111 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x111 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x310 = + (x211 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x211 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x44 = + (x310 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x310 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x54 = + (x44 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x44 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h1 = (x54 << (uint32_t)32U | x54 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x1 = + (z2h1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x212 = + (x1 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x1 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x311 = + (x212 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x212 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x45 = + (x311 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x311 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x55 = + (x45 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x45 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h2 = (x55 << (uint32_t)32U | x55 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z1 = z00; + uint64_t z2 = z0h1 ^ z21; + uint64_t z3 = z13 ^ z2h2; + uint64_t z4 = z1h1; + 
uint64_t v3 = z4 << (uint32_t)1U | z3 >> (uint32_t)63U; + uint64_t v2 = z3 << (uint32_t)1U | z2 >> (uint32_t)63U; + uint64_t v1 = z2 << (uint32_t)1U | z1 >> (uint32_t)63U; + uint64_t v0 = z1 << (uint32_t)1U; + uint64_t v21 = v2 ^ (v0 ^ (v0 >> (uint32_t)1U ^ (v0 >> (uint32_t)2U ^ v0 >> (uint32_t)7U))); + uint64_t v11 = v1 ^ (v0 << (uint32_t)63U ^ (v0 << (uint32_t)62U ^ v0 << (uint32_t)57U)); + uint64_t + v31 = v3 ^ (v11 ^ (v11 >> (uint32_t)1U ^ (v11 >> (uint32_t)2U ^ v11 >> (uint32_t)7U))); + uint64_t v22 = v21 ^ (v11 << (uint32_t)63U ^ (v11 << (uint32_t)62U ^ v11 << (uint32_t)57U)); + uint64_t x112 = v22; + uint64_t x2 = v31; + x[0U] = x112; + x[1U] = x2; +} + +static inline void load_precompute_r(uint64_t *pre, uint8_t *key) +{ + uint64_t *h1_0 = pre + (uint32_t)6U; + uint64_t *h2_0 = pre + (uint32_t)4U; + uint64_t *h3_0 = pre + (uint32_t)2U; + uint64_t *h4_0 = pre; + uint64_t u = load64_be(key); + h1_0[1U] = u; + uint64_t u0 = load64_be(key + (uint32_t)8U); + h1_0[0U] = u0; + h2_0[0U] = h1_0[0U]; + h2_0[1U] = h1_0[1U]; + h3_0[0U] = h1_0[0U]; + h3_0[1U] = h1_0[1U]; + h4_0[0U] = h1_0[0U]; + h4_0[1U] = h1_0[1U]; + fmul0(h2_0, h1_0); + fmul0(h3_0, h2_0); + fmul0(h4_0, h3_0); + uint64_t uu____0 = h1_0[0U]; + uint64_t + x = + (uu____0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x1 = + (x & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x2 = + (x1 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x1 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x3 = + (x2 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x2 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x4 = + (x3 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x3 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[14U] = 
x4 << (uint32_t)32U | x4 >> (uint32_t)32U; + uint64_t uu____1 = h1_0[1U]; + uint64_t + x0 = + (uu____1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x10 = + (x0 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x0 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x20 = + (x10 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x10 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x30 = + (x20 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x20 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x40 = + (x30 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x30 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[15U] = x40 << (uint32_t)32U | x40 >> (uint32_t)32U; + uint64_t uu____2 = h2_0[0U]; + uint64_t + x5 = + (uu____2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x11 = + (x5 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x5 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x21 = + (x11 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x11 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x31 = + (x21 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x21 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x41 = + (x31 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x31 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[12U] = x41 << (uint32_t)32U | x41 >> (uint32_t)32U; + uint64_t uu____3 = h2_0[1U]; + uint64_t + x6 = + (uu____3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x12 = + (x6 & 
(uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x6 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x22 = + (x12 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x12 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x32 = + (x22 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x22 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x42 = + (x32 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x32 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[13U] = x42 << (uint32_t)32U | x42 >> (uint32_t)32U; + uint64_t uu____4 = h3_0[0U]; + uint64_t + x7 = + (uu____4 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____4 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x13 = + (x7 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x7 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x23 = + (x13 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x13 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x33 = + (x23 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x23 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x43 = + (x33 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x33 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[10U] = x43 << (uint32_t)32U | x43 >> (uint32_t)32U; + uint64_t uu____5 = h3_0[1U]; + uint64_t + x8 = + (uu____5 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____5 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x14 = + (x8 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x8 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x24 = + (x14 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x14 >> (uint32_t)(uint8_t)4U & 
(uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x34 = + (x24 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x24 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x44 = + (x34 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x34 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[11U] = x44 << (uint32_t)32U | x44 >> (uint32_t)32U; + uint64_t uu____6 = h4_0[0U]; + uint64_t + x9 = + (uu____6 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____6 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x15 = + (x9 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x9 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x25 = + (x15 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x15 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x35 = + (x25 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x25 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x45 = + (x35 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x35 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[8U] = x45 << (uint32_t)32U | x45 >> (uint32_t)32U; + uint64_t uu____7 = h4_0[1U]; + uint64_t + x16 = + (uu____7 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____7 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x17 = + (x16 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x16 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x26 = + (x17 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x17 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x36 = + (x26 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x26 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x46 = + (x36 & (uint64_t)0x0000FFFF0000FFFFU) + << 
(uint32_t)(uint8_t)16U + | (x36 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + pre[9U] = x46 << (uint32_t)32U | x46 >> (uint32_t)32U; +} + +static inline void normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) +{ + uint64_t *x1 = x; + uint64_t *x2 = x + (uint32_t)2U; + uint64_t *x3 = x + (uint32_t)4U; + uint64_t *x4 = x + (uint32_t)6U; + uint64_t *y1 = pre; + uint64_t *y2 = pre + (uint32_t)2U; + uint64_t *y3 = pre + (uint32_t)4U; + uint64_t *y4 = pre + (uint32_t)6U; + uint64_t *yr1 = pre + (uint32_t)8U; + uint64_t *yr2 = pre + (uint32_t)10U; + uint64_t *yr3 = pre + (uint32_t)12U; + uint64_t *yr4 = pre + (uint32_t)14U; + uint64_t uu____0 = x1[0U]; + uint64_t uu____1 = x1[1U]; + uint64_t uu____2 = y1[0U]; + uint64_t uu____3 = y1[1U]; + uint64_t uu____4 = y1[0U] ^ y1[1U]; + uint64_t uu____5 = yr1[0U]; + uint64_t uu____6 = yr1[1U]; + uint64_t uu____7 = yr1[0U] ^ yr1[1U]; + uint64_t + x50 = + (uu____0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x6 = + (x50 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x50 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x7 = + (x6 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x6 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x8 = + (x7 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x7 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x9 = + (x8 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x8 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r = x9 << (uint32_t)32U | x9 >> (uint32_t)32U; + uint64_t + x51 = + (uu____1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x60 = + (x51 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x51 >> 
(uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x70 = + (x60 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x60 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x80 = + (x70 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x70 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x90 = + (x80 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x80 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r = x90 << (uint32_t)32U | x90 >> (uint32_t)32U; + uint64_t y310 = uu____0 ^ uu____1; + uint64_t y3r0 = y1r ^ y2r; + uint64_t x00 = uu____0 & (uint64_t)0x1111111111111111U; + uint64_t x110 = uu____0 & (uint64_t)0x2222222222222222U; + uint64_t x210 = uu____0 & (uint64_t)0x4444444444444444U; + uint64_t x310 = uu____0 & (uint64_t)0x8888888888888888U; + uint64_t y00 = uu____2 & (uint64_t)0x1111111111111111U; + uint64_t y110 = uu____2 & (uint64_t)0x2222222222222222U; + uint64_t y210 = uu____2 & (uint64_t)0x4444444444444444U; + uint64_t y320 = uu____2 & (uint64_t)0x8888888888888888U; + uint64_t z00 = x00 * y00 ^ (x110 * y320 ^ (x210 * y210 ^ x310 * y110)); + uint64_t z10 = x00 * y110 ^ (x110 * y00 ^ (x210 * y320 ^ x310 * y210)); + uint64_t z20 = x00 * y210 ^ (x110 * y110 ^ (x210 * y00 ^ x310 * y320)); + uint64_t z30 = x00 * y320 ^ (x110 * y210 ^ (x210 * y110 ^ x310 * y00)); + uint64_t + z02 = + (((z00 & (uint64_t)0x1111111111111111U) | (z10 & (uint64_t)0x2222222222222222U)) + | (z20 & (uint64_t)0x4444444444444444U)) + | (z30 & (uint64_t)0x8888888888888888U); + uint64_t x01 = uu____1 & (uint64_t)0x1111111111111111U; + uint64_t x111 = uu____1 & (uint64_t)0x2222222222222222U; + uint64_t x211 = uu____1 & (uint64_t)0x4444444444444444U; + uint64_t x311 = uu____1 & (uint64_t)0x8888888888888888U; + uint64_t y01 = uu____3 & (uint64_t)0x1111111111111111U; + uint64_t y111 = uu____3 & (uint64_t)0x2222222222222222U; + uint64_t y211 = uu____3 & 
(uint64_t)0x4444444444444444U; + uint64_t y321 = uu____3 & (uint64_t)0x8888888888888888U; + uint64_t z010 = x01 * y01 ^ (x111 * y321 ^ (x211 * y211 ^ x311 * y111)); + uint64_t z14 = x01 * y111 ^ (x111 * y01 ^ (x211 * y321 ^ x311 * y211)); + uint64_t z24 = x01 * y211 ^ (x111 * y111 ^ (x211 * y01 ^ x311 * y321)); + uint64_t z33 = x01 * y321 ^ (x111 * y211 ^ (x211 * y111 ^ x311 * y01)); + uint64_t + z15 = + (((z010 & (uint64_t)0x1111111111111111U) | (z14 & (uint64_t)0x2222222222222222U)) + | (z24 & (uint64_t)0x4444444444444444U)) + | (z33 & (uint64_t)0x8888888888888888U); + uint64_t x02 = y310 & (uint64_t)0x1111111111111111U; + uint64_t x112 = y310 & (uint64_t)0x2222222222222222U; + uint64_t x212 = y310 & (uint64_t)0x4444444444444444U; + uint64_t x312 = y310 & (uint64_t)0x8888888888888888U; + uint64_t y02 = uu____4 & (uint64_t)0x1111111111111111U; + uint64_t y112 = uu____4 & (uint64_t)0x2222222222222222U; + uint64_t y212 = uu____4 & (uint64_t)0x4444444444444444U; + uint64_t y322 = uu____4 & (uint64_t)0x8888888888888888U; + uint64_t z011 = x02 * y02 ^ (x112 * y322 ^ (x212 * y212 ^ x312 * y112)); + uint64_t z110 = x02 * y112 ^ (x112 * y02 ^ (x212 * y322 ^ x312 * y212)); + uint64_t z25 = x02 * y212 ^ (x112 * y112 ^ (x212 * y02 ^ x312 * y322)); + uint64_t z34 = x02 * y322 ^ (x112 * y212 ^ (x212 * y112 ^ x312 * y02)); + uint64_t + z26 = + (((z011 & (uint64_t)0x1111111111111111U) | (z110 & (uint64_t)0x2222222222222222U)) + | (z25 & (uint64_t)0x4444444444444444U)) + | (z34 & (uint64_t)0x8888888888888888U); + uint64_t x03 = y1r & (uint64_t)0x1111111111111111U; + uint64_t x113 = y1r & (uint64_t)0x2222222222222222U; + uint64_t x213 = y1r & (uint64_t)0x4444444444444444U; + uint64_t x313 = y1r & (uint64_t)0x8888888888888888U; + uint64_t y03 = uu____5 & (uint64_t)0x1111111111111111U; + uint64_t y113 = uu____5 & (uint64_t)0x2222222222222222U; + uint64_t y213 = uu____5 & (uint64_t)0x4444444444444444U; + uint64_t y323 = uu____5 & (uint64_t)0x8888888888888888U; + uint64_t z012 = x03 * 
y03 ^ (x113 * y323 ^ (x213 * y213 ^ x313 * y113)); + uint64_t z111 = x03 * y113 ^ (x113 * y03 ^ (x213 * y323 ^ x313 * y213)); + uint64_t z210 = x03 * y213 ^ (x113 * y113 ^ (x213 * y03 ^ x313 * y323)); + uint64_t z35 = x03 * y323 ^ (x113 * y213 ^ (x213 * y113 ^ x313 * y03)); + uint64_t + z0h = + (((z012 & (uint64_t)0x1111111111111111U) | (z111 & (uint64_t)0x2222222222222222U)) + | (z210 & (uint64_t)0x4444444444444444U)) + | (z35 & (uint64_t)0x8888888888888888U); + uint64_t x04 = y2r & (uint64_t)0x1111111111111111U; + uint64_t x114 = y2r & (uint64_t)0x2222222222222222U; + uint64_t x214 = y2r & (uint64_t)0x4444444444444444U; + uint64_t x314 = y2r & (uint64_t)0x8888888888888888U; + uint64_t y04 = uu____6 & (uint64_t)0x1111111111111111U; + uint64_t y114 = uu____6 & (uint64_t)0x2222222222222222U; + uint64_t y214 = uu____6 & (uint64_t)0x4444444444444444U; + uint64_t y324 = uu____6 & (uint64_t)0x8888888888888888U; + uint64_t z013 = x04 * y04 ^ (x114 * y324 ^ (x214 * y214 ^ x314 * y114)); + uint64_t z112 = x04 * y114 ^ (x114 * y04 ^ (x214 * y324 ^ x314 * y214)); + uint64_t z211 = x04 * y214 ^ (x114 * y114 ^ (x214 * y04 ^ x314 * y324)); + uint64_t z36 = x04 * y324 ^ (x114 * y214 ^ (x214 * y114 ^ x314 * y04)); + uint64_t + z1h = + (((z013 & (uint64_t)0x1111111111111111U) | (z112 & (uint64_t)0x2222222222222222U)) + | (z211 & (uint64_t)0x4444444444444444U)) + | (z36 & (uint64_t)0x8888888888888888U); + uint64_t x05 = y3r0 & (uint64_t)0x1111111111111111U; + uint64_t x115 = y3r0 & (uint64_t)0x2222222222222222U; + uint64_t x215 = y3r0 & (uint64_t)0x4444444444444444U; + uint64_t x315 = y3r0 & (uint64_t)0x8888888888888888U; + uint64_t y05 = uu____7 & (uint64_t)0x1111111111111111U; + uint64_t y115 = uu____7 & (uint64_t)0x2222222222222222U; + uint64_t y215 = uu____7 & (uint64_t)0x4444444444444444U; + uint64_t y325 = uu____7 & (uint64_t)0x8888888888888888U; + uint64_t z014 = x05 * y05 ^ (x115 * y325 ^ (x215 * y215 ^ x315 * y115)); + uint64_t z113 = x05 * y115 ^ (x115 * y05 ^ (x215 * 
y325 ^ x315 * y215)); + uint64_t z212 = x05 * y215 ^ (x115 * y115 ^ (x215 * y05 ^ x315 * y325)); + uint64_t z37 = x05 * y325 ^ (x115 * y215 ^ (x215 * y115 ^ x315 * y05)); + uint64_t + z2h = + (((z014 & (uint64_t)0x1111111111111111U) | (z113 & (uint64_t)0x2222222222222222U)) + | (z212 & (uint64_t)0x4444444444444444U)) + | (z37 & (uint64_t)0x8888888888888888U); + uint64_t z213 = z26 ^ (z02 ^ z15); + uint64_t z2h10 = z2h ^ (z0h ^ z1h); + uint64_t + x52 = + (z0h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x61 = + (x52 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x52 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x71 = + (x61 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x61 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x81 = + (x71 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x71 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x91 = + (x81 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x81 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h1 = (x91 << (uint32_t)32U | x91 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x53 = + (z1h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x62 = + (x53 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x53 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x72 = + (x62 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x62 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x82 = + (x72 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x72 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x92 = + (x82 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | 
(x82 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h1 = (x92 << (uint32_t)32U | x92 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x54 = + (z2h10 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h10 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x63 = + (x54 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x54 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x73 = + (x63 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x63 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x83 = + (x73 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x73 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x93 = + (x83 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x83 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h2 = (x93 << (uint32_t)32U | x93 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z1_1 = z02; + uint64_t z1_2 = z0h1 ^ z213; + uint64_t z1_3 = z15 ^ z2h2; + uint64_t z1_4 = z1h1; + uint64_t uu____8 = x2[0U]; + uint64_t uu____9 = x2[1U]; + uint64_t uu____10 = y2[0U]; + uint64_t uu____11 = y2[1U]; + uint64_t uu____12 = y2[0U] ^ y2[1U]; + uint64_t uu____13 = yr2[0U]; + uint64_t uu____14 = yr2[1U]; + uint64_t uu____15 = yr2[0U] ^ yr2[1U]; + uint64_t + x55 = + (uu____8 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____8 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x64 = + (x55 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x55 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x74 = + (x64 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x64 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x84 = + (x74 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x74 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + 
uint64_t + x94 = + (x84 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x84 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r0 = x94 << (uint32_t)32U | x94 >> (uint32_t)32U; + uint64_t + x56 = + (uu____9 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____9 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x65 = + (x56 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x56 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x75 = + (x65 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x65 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x85 = + (x75 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x75 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x95 = + (x85 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x85 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r0 = x95 << (uint32_t)32U | x95 >> (uint32_t)32U; + uint64_t y311 = uu____8 ^ uu____9; + uint64_t y3r1 = y1r0 ^ y2r0; + uint64_t x06 = uu____8 & (uint64_t)0x1111111111111111U; + uint64_t x116 = uu____8 & (uint64_t)0x2222222222222222U; + uint64_t x216 = uu____8 & (uint64_t)0x4444444444444444U; + uint64_t x316 = uu____8 & (uint64_t)0x8888888888888888U; + uint64_t y06 = uu____10 & (uint64_t)0x1111111111111111U; + uint64_t y116 = uu____10 & (uint64_t)0x2222222222222222U; + uint64_t y216 = uu____10 & (uint64_t)0x4444444444444444U; + uint64_t y326 = uu____10 & (uint64_t)0x8888888888888888U; + uint64_t z03 = x06 * y06 ^ (x116 * y326 ^ (x216 * y216 ^ x316 * y116)); + uint64_t z16 = x06 * y116 ^ (x116 * y06 ^ (x216 * y326 ^ x316 * y216)); + uint64_t z27 = x06 * y216 ^ (x116 * y116 ^ (x216 * y06 ^ x316 * y326)); + uint64_t z38 = x06 * y326 ^ (x116 * y216 ^ (x216 * y116 ^ x316 * y06)); + uint64_t + z04 = + (((z03 & (uint64_t)0x1111111111111111U) | (z16 & (uint64_t)0x2222222222222222U)) 
+ | (z27 & (uint64_t)0x4444444444444444U)) + | (z38 & (uint64_t)0x8888888888888888U); + uint64_t x07 = uu____9 & (uint64_t)0x1111111111111111U; + uint64_t x117 = uu____9 & (uint64_t)0x2222222222222222U; + uint64_t x217 = uu____9 & (uint64_t)0x4444444444444444U; + uint64_t x317 = uu____9 & (uint64_t)0x8888888888888888U; + uint64_t y07 = uu____11 & (uint64_t)0x1111111111111111U; + uint64_t y117 = uu____11 & (uint64_t)0x2222222222222222U; + uint64_t y217 = uu____11 & (uint64_t)0x4444444444444444U; + uint64_t y327 = uu____11 & (uint64_t)0x8888888888888888U; + uint64_t z015 = x07 * y07 ^ (x117 * y327 ^ (x217 * y217 ^ x317 * y117)); + uint64_t z17 = x07 * y117 ^ (x117 * y07 ^ (x217 * y327 ^ x317 * y217)); + uint64_t z28 = x07 * y217 ^ (x117 * y117 ^ (x217 * y07 ^ x317 * y327)); + uint64_t z39 = x07 * y327 ^ (x117 * y217 ^ (x217 * y117 ^ x317 * y07)); + uint64_t + z18 = + (((z015 & (uint64_t)0x1111111111111111U) | (z17 & (uint64_t)0x2222222222222222U)) + | (z28 & (uint64_t)0x4444444444444444U)) + | (z39 & (uint64_t)0x8888888888888888U); + uint64_t x08 = y311 & (uint64_t)0x1111111111111111U; + uint64_t x118 = y311 & (uint64_t)0x2222222222222222U; + uint64_t x218 = y311 & (uint64_t)0x4444444444444444U; + uint64_t x318 = y311 & (uint64_t)0x8888888888888888U; + uint64_t y08 = uu____12 & (uint64_t)0x1111111111111111U; + uint64_t y118 = uu____12 & (uint64_t)0x2222222222222222U; + uint64_t y218 = uu____12 & (uint64_t)0x4444444444444444U; + uint64_t y328 = uu____12 & (uint64_t)0x8888888888888888U; + uint64_t z016 = x08 * y08 ^ (x118 * y328 ^ (x218 * y218 ^ x318 * y118)); + uint64_t z114 = x08 * y118 ^ (x118 * y08 ^ (x218 * y328 ^ x318 * y218)); + uint64_t z29 = x08 * y218 ^ (x118 * y118 ^ (x218 * y08 ^ x318 * y328)); + uint64_t z310 = x08 * y328 ^ (x118 * y218 ^ (x218 * y118 ^ x318 * y08)); + uint64_t + z214 = + (((z016 & (uint64_t)0x1111111111111111U) | (z114 & (uint64_t)0x2222222222222222U)) + | (z29 & (uint64_t)0x4444444444444444U)) + | (z310 & (uint64_t)0x8888888888888888U); 
+ uint64_t x09 = y1r0 & (uint64_t)0x1111111111111111U; + uint64_t x119 = y1r0 & (uint64_t)0x2222222222222222U; + uint64_t x219 = y1r0 & (uint64_t)0x4444444444444444U; + uint64_t x319 = y1r0 & (uint64_t)0x8888888888888888U; + uint64_t y09 = uu____13 & (uint64_t)0x1111111111111111U; + uint64_t y119 = uu____13 & (uint64_t)0x2222222222222222U; + uint64_t y219 = uu____13 & (uint64_t)0x4444444444444444U; + uint64_t y329 = uu____13 & (uint64_t)0x8888888888888888U; + uint64_t z017 = x09 * y09 ^ (x119 * y329 ^ (x219 * y219 ^ x319 * y119)); + uint64_t z115 = x09 * y119 ^ (x119 * y09 ^ (x219 * y329 ^ x319 * y219)); + uint64_t z215 = x09 * y219 ^ (x119 * y119 ^ (x219 * y09 ^ x319 * y329)); + uint64_t z311 = x09 * y329 ^ (x119 * y219 ^ (x219 * y119 ^ x319 * y09)); + uint64_t + z0h0 = + (((z017 & (uint64_t)0x1111111111111111U) | (z115 & (uint64_t)0x2222222222222222U)) + | (z215 & (uint64_t)0x4444444444444444U)) + | (z311 & (uint64_t)0x8888888888888888U); + uint64_t x010 = y2r0 & (uint64_t)0x1111111111111111U; + uint64_t x1110 = y2r0 & (uint64_t)0x2222222222222222U; + uint64_t x2110 = y2r0 & (uint64_t)0x4444444444444444U; + uint64_t x3110 = y2r0 & (uint64_t)0x8888888888888888U; + uint64_t y010 = uu____14 & (uint64_t)0x1111111111111111U; + uint64_t y1110 = uu____14 & (uint64_t)0x2222222222222222U; + uint64_t y2110 = uu____14 & (uint64_t)0x4444444444444444U; + uint64_t y3210 = uu____14 & (uint64_t)0x8888888888888888U; + uint64_t z018 = x010 * y010 ^ (x1110 * y3210 ^ (x2110 * y2110 ^ x3110 * y1110)); + uint64_t z116 = x010 * y1110 ^ (x1110 * y010 ^ (x2110 * y3210 ^ x3110 * y2110)); + uint64_t z216 = x010 * y2110 ^ (x1110 * y1110 ^ (x2110 * y010 ^ x3110 * y3210)); + uint64_t z312 = x010 * y3210 ^ (x1110 * y2110 ^ (x2110 * y1110 ^ x3110 * y010)); + uint64_t + z1h0 = + (((z018 & (uint64_t)0x1111111111111111U) | (z116 & (uint64_t)0x2222222222222222U)) + | (z216 & (uint64_t)0x4444444444444444U)) + | (z312 & (uint64_t)0x8888888888888888U); + uint64_t x011 = y3r1 & 
(uint64_t)0x1111111111111111U; + uint64_t x1111 = y3r1 & (uint64_t)0x2222222222222222U; + uint64_t x2111 = y3r1 & (uint64_t)0x4444444444444444U; + uint64_t x3111 = y3r1 & (uint64_t)0x8888888888888888U; + uint64_t y011 = uu____15 & (uint64_t)0x1111111111111111U; + uint64_t y1111 = uu____15 & (uint64_t)0x2222222222222222U; + uint64_t y2111 = uu____15 & (uint64_t)0x4444444444444444U; + uint64_t y3211 = uu____15 & (uint64_t)0x8888888888888888U; + uint64_t z019 = x011 * y011 ^ (x1111 * y3211 ^ (x2111 * y2111 ^ x3111 * y1111)); + uint64_t z117 = x011 * y1111 ^ (x1111 * y011 ^ (x2111 * y3211 ^ x3111 * y2111)); + uint64_t z217 = x011 * y2111 ^ (x1111 * y1111 ^ (x2111 * y011 ^ x3111 * y3211)); + uint64_t z313 = x011 * y3211 ^ (x1111 * y2111 ^ (x2111 * y1111 ^ x3111 * y011)); + uint64_t + z2h0 = + (((z019 & (uint64_t)0x1111111111111111U) | (z117 & (uint64_t)0x2222222222222222U)) + | (z217 & (uint64_t)0x4444444444444444U)) + | (z313 & (uint64_t)0x8888888888888888U); + uint64_t z218 = z214 ^ (z04 ^ z18); + uint64_t z2h11 = z2h0 ^ (z0h0 ^ z1h0); + uint64_t + x57 = + (z0h0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x66 = + (x57 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x57 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x76 = + (x66 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x66 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x86 = + (x76 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x76 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x96 = + (x86 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x86 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h10 = (x96 << (uint32_t)32U | x96 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x58 = + (z1h0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | 
(z1h0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x67 = + (x58 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x58 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x77 = + (x67 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x67 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x87 = + (x77 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x77 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x97 = + (x87 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x87 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h10 = (x97 << (uint32_t)32U | x97 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x59 = + (z2h11 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h11 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x68 = + (x59 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x59 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x78 = + (x68 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x68 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x88 = + (x78 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x78 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x98 = + (x88 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x88 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h20 = (x98 << (uint32_t)32U | x98 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z2_1 = z04; + uint64_t z2_2 = z0h10 ^ z218; + uint64_t z2_3 = z18 ^ z2h20; + uint64_t z2_4 = z1h10; + uint64_t z1 = z1_1 ^ z2_1; + uint64_t z2 = z1_2 ^ z2_2; + uint64_t z3 = z1_3 ^ z2_3; + uint64_t z4 = z1_4 ^ z2_4; + uint64_t uu____16 = x3[0U]; + uint64_t uu____17 = x3[1U]; + uint64_t uu____18 = y3[0U]; + uint64_t uu____19 = y3[1U]; + uint64_t uu____20 = y3[0U] 
^ y3[1U]; + uint64_t uu____21 = yr3[0U]; + uint64_t uu____22 = yr3[1U]; + uint64_t uu____23 = yr3[0U] ^ yr3[1U]; + uint64_t + x510 = + (uu____16 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____16 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x69 = + (x510 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x510 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x79 = + (x69 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x69 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x89 = + (x79 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x79 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x99 = + (x89 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x89 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r1 = x99 << (uint32_t)32U | x99 >> (uint32_t)32U; + uint64_t + x511 = + (uu____17 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____17 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x610 = + (x511 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x511 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x710 = + (x610 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x610 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x810 = + (x710 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x710 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x910 = + (x810 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x810 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r1 = x910 << (uint32_t)32U | x910 >> (uint32_t)32U; + uint64_t y312 = uu____16 ^ uu____17; + uint64_t y3r2 = y1r1 ^ y2r1; + uint64_t x012 = uu____16 & (uint64_t)0x1111111111111111U; + uint64_t x1112 = uu____16 & 
(uint64_t)0x2222222222222222U; + uint64_t x2112 = uu____16 & (uint64_t)0x4444444444444444U; + uint64_t x3112 = uu____16 & (uint64_t)0x8888888888888888U; + uint64_t y012 = uu____18 & (uint64_t)0x1111111111111111U; + uint64_t y1112 = uu____18 & (uint64_t)0x2222222222222222U; + uint64_t y2112 = uu____18 & (uint64_t)0x4444444444444444U; + uint64_t y3212 = uu____18 & (uint64_t)0x8888888888888888U; + uint64_t z05 = x012 * y012 ^ (x1112 * y3212 ^ (x2112 * y2112 ^ x3112 * y1112)); + uint64_t z118 = x012 * y1112 ^ (x1112 * y012 ^ (x2112 * y3212 ^ x3112 * y2112)); + uint64_t z219 = x012 * y2112 ^ (x1112 * y1112 ^ (x2112 * y012 ^ x3112 * y3212)); + uint64_t z314 = x012 * y3212 ^ (x1112 * y2112 ^ (x2112 * y1112 ^ x3112 * y012)); + uint64_t + z06 = + (((z05 & (uint64_t)0x1111111111111111U) | (z118 & (uint64_t)0x2222222222222222U)) + | (z219 & (uint64_t)0x4444444444444444U)) + | (z314 & (uint64_t)0x8888888888888888U); + uint64_t x013 = uu____17 & (uint64_t)0x1111111111111111U; + uint64_t x1113 = uu____17 & (uint64_t)0x2222222222222222U; + uint64_t x2113 = uu____17 & (uint64_t)0x4444444444444444U; + uint64_t x3113 = uu____17 & (uint64_t)0x8888888888888888U; + uint64_t y013 = uu____19 & (uint64_t)0x1111111111111111U; + uint64_t y1113 = uu____19 & (uint64_t)0x2222222222222222U; + uint64_t y2113 = uu____19 & (uint64_t)0x4444444444444444U; + uint64_t y3213 = uu____19 & (uint64_t)0x8888888888888888U; + uint64_t z0110 = x013 * y013 ^ (x1113 * y3213 ^ (x2113 * y2113 ^ x3113 * y1113)); + uint64_t z119 = x013 * y1113 ^ (x1113 * y013 ^ (x2113 * y3213 ^ x3113 * y2113)); + uint64_t z2110 = x013 * y2113 ^ (x1113 * y1113 ^ (x2113 * y013 ^ x3113 * y3213)); + uint64_t z315 = x013 * y3213 ^ (x1113 * y2113 ^ (x2113 * y1113 ^ x3113 * y013)); + uint64_t + z1110 = + (((z0110 & (uint64_t)0x1111111111111111U) | (z119 & (uint64_t)0x2222222222222222U)) + | (z2110 & (uint64_t)0x4444444444444444U)) + | (z315 & (uint64_t)0x8888888888888888U); + uint64_t x014 = y312 & (uint64_t)0x1111111111111111U; + 
uint64_t x1114 = y312 & (uint64_t)0x2222222222222222U; + uint64_t x2114 = y312 & (uint64_t)0x4444444444444444U; + uint64_t x3114 = y312 & (uint64_t)0x8888888888888888U; + uint64_t y014 = uu____20 & (uint64_t)0x1111111111111111U; + uint64_t y1114 = uu____20 & (uint64_t)0x2222222222222222U; + uint64_t y2114 = uu____20 & (uint64_t)0x4444444444444444U; + uint64_t y3214 = uu____20 & (uint64_t)0x8888888888888888U; + uint64_t z0111 = x014 * y014 ^ (x1114 * y3214 ^ (x2114 * y2114 ^ x3114 * y1114)); + uint64_t z120 = x014 * y1114 ^ (x1114 * y014 ^ (x2114 * y3214 ^ x3114 * y2114)); + uint64_t z2111 = x014 * y2114 ^ (x1114 * y1114 ^ (x2114 * y014 ^ x3114 * y3214)); + uint64_t z316 = x014 * y3214 ^ (x1114 * y2114 ^ (x2114 * y1114 ^ x3114 * y014)); + uint64_t + z2112 = + (((z0111 & (uint64_t)0x1111111111111111U) | (z120 & (uint64_t)0x2222222222222222U)) + | (z2111 & (uint64_t)0x4444444444444444U)) + | (z316 & (uint64_t)0x8888888888888888U); + uint64_t x015 = y1r1 & (uint64_t)0x1111111111111111U; + uint64_t x1115 = y1r1 & (uint64_t)0x2222222222222222U; + uint64_t x2115 = y1r1 & (uint64_t)0x4444444444444444U; + uint64_t x3115 = y1r1 & (uint64_t)0x8888888888888888U; + uint64_t y015 = uu____21 & (uint64_t)0x1111111111111111U; + uint64_t y1115 = uu____21 & (uint64_t)0x2222222222222222U; + uint64_t y2115 = uu____21 & (uint64_t)0x4444444444444444U; + uint64_t y3215 = uu____21 & (uint64_t)0x8888888888888888U; + uint64_t z0112 = x015 * y015 ^ (x1115 * y3215 ^ (x2115 * y2115 ^ x3115 * y1115)); + uint64_t z121 = x015 * y1115 ^ (x1115 * y015 ^ (x2115 * y3215 ^ x3115 * y2115)); + uint64_t z220 = x015 * y2115 ^ (x1115 * y1115 ^ (x2115 * y015 ^ x3115 * y3215)); + uint64_t z317 = x015 * y3215 ^ (x1115 * y2115 ^ (x2115 * y1115 ^ x3115 * y015)); + uint64_t + z0h2 = + (((z0112 & (uint64_t)0x1111111111111111U) | (z121 & (uint64_t)0x2222222222222222U)) + | (z220 & (uint64_t)0x4444444444444444U)) + | (z317 & (uint64_t)0x8888888888888888U); + uint64_t x016 = y2r1 & (uint64_t)0x1111111111111111U; + 
uint64_t x1116 = y2r1 & (uint64_t)0x2222222222222222U; + uint64_t x2116 = y2r1 & (uint64_t)0x4444444444444444U; + uint64_t x3116 = y2r1 & (uint64_t)0x8888888888888888U; + uint64_t y016 = uu____22 & (uint64_t)0x1111111111111111U; + uint64_t y1116 = uu____22 & (uint64_t)0x2222222222222222U; + uint64_t y2116 = uu____22 & (uint64_t)0x4444444444444444U; + uint64_t y3216 = uu____22 & (uint64_t)0x8888888888888888U; + uint64_t z0113 = x016 * y016 ^ (x1116 * y3216 ^ (x2116 * y2116 ^ x3116 * y1116)); + uint64_t z122 = x016 * y1116 ^ (x1116 * y016 ^ (x2116 * y3216 ^ x3116 * y2116)); + uint64_t z221 = x016 * y2116 ^ (x1116 * y1116 ^ (x2116 * y016 ^ x3116 * y3216)); + uint64_t z318 = x016 * y3216 ^ (x1116 * y2116 ^ (x2116 * y1116 ^ x3116 * y016)); + uint64_t + z1h2 = + (((z0113 & (uint64_t)0x1111111111111111U) | (z122 & (uint64_t)0x2222222222222222U)) + | (z221 & (uint64_t)0x4444444444444444U)) + | (z318 & (uint64_t)0x8888888888888888U); + uint64_t x017 = y3r2 & (uint64_t)0x1111111111111111U; + uint64_t x1117 = y3r2 & (uint64_t)0x2222222222222222U; + uint64_t x2117 = y3r2 & (uint64_t)0x4444444444444444U; + uint64_t x3117 = y3r2 & (uint64_t)0x8888888888888888U; + uint64_t y017 = uu____23 & (uint64_t)0x1111111111111111U; + uint64_t y1117 = uu____23 & (uint64_t)0x2222222222222222U; + uint64_t y2117 = uu____23 & (uint64_t)0x4444444444444444U; + uint64_t y3217 = uu____23 & (uint64_t)0x8888888888888888U; + uint64_t z0114 = x017 * y017 ^ (x1117 * y3217 ^ (x2117 * y2117 ^ x3117 * y1117)); + uint64_t z123 = x017 * y1117 ^ (x1117 * y017 ^ (x2117 * y3217 ^ x3117 * y2117)); + uint64_t z222 = x017 * y2117 ^ (x1117 * y1117 ^ (x2117 * y017 ^ x3117 * y3217)); + uint64_t z319 = x017 * y3217 ^ (x1117 * y2117 ^ (x2117 * y1117 ^ x3117 * y017)); + uint64_t + z2h3 = + (((z0114 & (uint64_t)0x1111111111111111U) | (z123 & (uint64_t)0x2222222222222222U)) + | (z222 & (uint64_t)0x4444444444444444U)) + | (z319 & (uint64_t)0x8888888888888888U); + uint64_t z223 = z2112 ^ (z06 ^ z1110); + uint64_t z2h12 = 
z2h3 ^ (z0h2 ^ z1h2); + uint64_t + x512 = + (z0h2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x611 = + (x512 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x512 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x711 = + (x611 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x611 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x811 = + (x711 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x711 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x911 = + (x811 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x811 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h11 = (x911 << (uint32_t)32U | x911 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x513 = + (z1h2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x612 = + (x513 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x513 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x712 = + (x612 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x612 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x812 = + (x712 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x712 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x912 = + (x812 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x812 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h11 = (x912 << (uint32_t)32U | x912 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x514 = + (z2h12 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h12 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x613 = + (x514 & (uint64_t)0x3333333333333333U) + << 
(uint32_t)(uint8_t)2U + | (x514 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x713 = + (x613 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x613 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x813 = + (x713 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x713 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x913 = + (x813 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x813 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h21 = (x913 << (uint32_t)32U | x913 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z3_1 = z06; + uint64_t z3_2 = z0h11 ^ z223; + uint64_t z3_3 = z1110 ^ z2h21; + uint64_t z3_4 = z1h11; + uint64_t z11 = z1 ^ z3_1; + uint64_t z21 = z2 ^ z3_2; + uint64_t z31 = z3 ^ z3_3; + uint64_t z41 = z4 ^ z3_4; + uint64_t uu____24 = x4[0U]; + uint64_t uu____25 = x4[1U]; + uint64_t uu____26 = y4[0U]; + uint64_t uu____27 = y4[1U]; + uint64_t uu____28 = y4[0U] ^ y4[1U]; + uint64_t uu____29 = yr4[0U]; + uint64_t uu____30 = yr4[1U]; + uint64_t uu____31 = yr4[0U] ^ yr4[1U]; + uint64_t + x515 = + (uu____24 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____24 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x614 = + (x515 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x515 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x714 = + (x614 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x614 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x814 = + (x714 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x714 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x914 = + (x814 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x814 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r2 = x914 << (uint32_t)32U | x914 >> (uint32_t)32U; + 
uint64_t + x516 = + (uu____25 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____25 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x615 = + (x516 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x516 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x715 = + (x615 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x615 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x815 = + (x715 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x715 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x915 = + (x815 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x815 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r2 = x915 << (uint32_t)32U | x915 >> (uint32_t)32U; + uint64_t y31 = uu____24 ^ uu____25; + uint64_t y3r = y1r2 ^ y2r2; + uint64_t x018 = uu____24 & (uint64_t)0x1111111111111111U; + uint64_t x1118 = uu____24 & (uint64_t)0x2222222222222222U; + uint64_t x2118 = uu____24 & (uint64_t)0x4444444444444444U; + uint64_t x3118 = uu____24 & (uint64_t)0x8888888888888888U; + uint64_t y018 = uu____26 & (uint64_t)0x1111111111111111U; + uint64_t y1118 = uu____26 & (uint64_t)0x2222222222222222U; + uint64_t y2118 = uu____26 & (uint64_t)0x4444444444444444U; + uint64_t y3218 = uu____26 & (uint64_t)0x8888888888888888U; + uint64_t z0 = x018 * y018 ^ (x1118 * y3218 ^ (x2118 * y2118 ^ x3118 * y1118)); + uint64_t z124 = x018 * y1118 ^ (x1118 * y018 ^ (x2118 * y3218 ^ x3118 * y2118)); + uint64_t z224 = x018 * y2118 ^ (x1118 * y1118 ^ (x2118 * y018 ^ x3118 * y3218)); + uint64_t z320 = x018 * y3218 ^ (x1118 * y2118 ^ (x2118 * y1118 ^ x3118 * y018)); + uint64_t + z07 = + (((z0 & (uint64_t)0x1111111111111111U) | (z124 & (uint64_t)0x2222222222222222U)) + | (z224 & (uint64_t)0x4444444444444444U)) + | (z320 & (uint64_t)0x8888888888888888U); + uint64_t x019 = uu____25 & (uint64_t)0x1111111111111111U; + 
uint64_t x1119 = uu____25 & (uint64_t)0x2222222222222222U; + uint64_t x2119 = uu____25 & (uint64_t)0x4444444444444444U; + uint64_t x3119 = uu____25 & (uint64_t)0x8888888888888888U; + uint64_t y019 = uu____27 & (uint64_t)0x1111111111111111U; + uint64_t y1119 = uu____27 & (uint64_t)0x2222222222222222U; + uint64_t y2119 = uu____27 & (uint64_t)0x4444444444444444U; + uint64_t y3219 = uu____27 & (uint64_t)0x8888888888888888U; + uint64_t z0115 = x019 * y019 ^ (x1119 * y3219 ^ (x2119 * y2119 ^ x3119 * y1119)); + uint64_t z125 = x019 * y1119 ^ (x1119 * y019 ^ (x2119 * y3219 ^ x3119 * y2119)); + uint64_t z225 = x019 * y2119 ^ (x1119 * y1119 ^ (x2119 * y019 ^ x3119 * y3219)); + uint64_t z321 = x019 * y3219 ^ (x1119 * y2119 ^ (x2119 * y1119 ^ x3119 * y019)); + uint64_t + z126 = + (((z0115 & (uint64_t)0x1111111111111111U) | (z125 & (uint64_t)0x2222222222222222U)) + | (z225 & (uint64_t)0x4444444444444444U)) + | (z321 & (uint64_t)0x8888888888888888U); + uint64_t x020 = y31 & (uint64_t)0x1111111111111111U; + uint64_t x1120 = y31 & (uint64_t)0x2222222222222222U; + uint64_t x2120 = y31 & (uint64_t)0x4444444444444444U; + uint64_t x3120 = y31 & (uint64_t)0x8888888888888888U; + uint64_t y020 = uu____28 & (uint64_t)0x1111111111111111U; + uint64_t y1120 = uu____28 & (uint64_t)0x2222222222222222U; + uint64_t y2120 = uu____28 & (uint64_t)0x4444444444444444U; + uint64_t y3220 = uu____28 & (uint64_t)0x8888888888888888U; + uint64_t z0116 = x020 * y020 ^ (x1120 * y3220 ^ (x2120 * y2120 ^ x3120 * y1120)); + uint64_t z130 = x020 * y1120 ^ (x1120 * y020 ^ (x2120 * y3220 ^ x3120 * y2120)); + uint64_t z226 = x020 * y2120 ^ (x1120 * y1120 ^ (x2120 * y020 ^ x3120 * y3220)); + uint64_t z322 = x020 * y3220 ^ (x1120 * y2120 ^ (x2120 * y1120 ^ x3120 * y020)); + uint64_t + z227 = + (((z0116 & (uint64_t)0x1111111111111111U) | (z130 & (uint64_t)0x2222222222222222U)) + | (z226 & (uint64_t)0x4444444444444444U)) + | (z322 & (uint64_t)0x8888888888888888U); + uint64_t x021 = y1r2 & (uint64_t)0x1111111111111111U; 
+ uint64_t x1121 = y1r2 & (uint64_t)0x2222222222222222U; + uint64_t x2121 = y1r2 & (uint64_t)0x4444444444444444U; + uint64_t x3121 = y1r2 & (uint64_t)0x8888888888888888U; + uint64_t y021 = uu____29 & (uint64_t)0x1111111111111111U; + uint64_t y1121 = uu____29 & (uint64_t)0x2222222222222222U; + uint64_t y2121 = uu____29 & (uint64_t)0x4444444444444444U; + uint64_t y3221 = uu____29 & (uint64_t)0x8888888888888888U; + uint64_t z0117 = x021 * y021 ^ (x1121 * y3221 ^ (x2121 * y2121 ^ x3121 * y1121)); + uint64_t z131 = x021 * y1121 ^ (x1121 * y021 ^ (x2121 * y3221 ^ x3121 * y2121)); + uint64_t z230 = x021 * y2121 ^ (x1121 * y1121 ^ (x2121 * y021 ^ x3121 * y3221)); + uint64_t z323 = x021 * y3221 ^ (x1121 * y2121 ^ (x2121 * y1121 ^ x3121 * y021)); + uint64_t + z0h3 = + (((z0117 & (uint64_t)0x1111111111111111U) | (z131 & (uint64_t)0x2222222222222222U)) + | (z230 & (uint64_t)0x4444444444444444U)) + | (z323 & (uint64_t)0x8888888888888888U); + uint64_t x022 = y2r2 & (uint64_t)0x1111111111111111U; + uint64_t x1122 = y2r2 & (uint64_t)0x2222222222222222U; + uint64_t x2122 = y2r2 & (uint64_t)0x4444444444444444U; + uint64_t x3122 = y2r2 & (uint64_t)0x8888888888888888U; + uint64_t y022 = uu____30 & (uint64_t)0x1111111111111111U; + uint64_t y1122 = uu____30 & (uint64_t)0x2222222222222222U; + uint64_t y2122 = uu____30 & (uint64_t)0x4444444444444444U; + uint64_t y3222 = uu____30 & (uint64_t)0x8888888888888888U; + uint64_t z0118 = x022 * y022 ^ (x1122 * y3222 ^ (x2122 * y2122 ^ x3122 * y1122)); + uint64_t z132 = x022 * y1122 ^ (x1122 * y022 ^ (x2122 * y3222 ^ x3122 * y2122)); + uint64_t z231 = x022 * y2122 ^ (x1122 * y1122 ^ (x2122 * y022 ^ x3122 * y3222)); + uint64_t z324 = x022 * y3222 ^ (x1122 * y2122 ^ (x2122 * y1122 ^ x3122 * y022)); + uint64_t + z1h3 = + (((z0118 & (uint64_t)0x1111111111111111U) | (z132 & (uint64_t)0x2222222222222222U)) + | (z231 & (uint64_t)0x4444444444444444U)) + | (z324 & (uint64_t)0x8888888888888888U); + uint64_t x0 = y3r & (uint64_t)0x1111111111111111U; + 
uint64_t x11 = y3r & (uint64_t)0x2222222222222222U; + uint64_t x21 = y3r & (uint64_t)0x4444444444444444U; + uint64_t x31 = y3r & (uint64_t)0x8888888888888888U; + uint64_t y0 = uu____31 & (uint64_t)0x1111111111111111U; + uint64_t y11 = uu____31 & (uint64_t)0x2222222222222222U; + uint64_t y21 = uu____31 & (uint64_t)0x4444444444444444U; + uint64_t y32 = uu____31 & (uint64_t)0x8888888888888888U; + uint64_t z01 = x0 * y0 ^ (x11 * y32 ^ (x21 * y21 ^ x31 * y11)); + uint64_t z13 = x0 * y11 ^ (x11 * y0 ^ (x21 * y32 ^ x31 * y21)); + uint64_t z232 = x0 * y21 ^ (x11 * y11 ^ (x21 * y0 ^ x31 * y32)); + uint64_t z325 = x0 * y32 ^ (x11 * y21 ^ (x21 * y11 ^ x31 * y0)); + uint64_t + z2h4 = + (((z01 & (uint64_t)0x1111111111111111U) | (z13 & (uint64_t)0x2222222222222222U)) + | (z232 & (uint64_t)0x4444444444444444U)) + | (z325 & (uint64_t)0x8888888888888888U); + uint64_t z23 = z227 ^ (z07 ^ z126); + uint64_t z2h1 = z2h4 ^ (z0h3 ^ z1h3); + uint64_t + x517 = + (z0h3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x616 = + (x517 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x517 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x716 = + (x616 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x616 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x816 = + (x716 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x716 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x916 = + (x816 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x816 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h12 = (x916 << (uint32_t)32U | x916 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x518 = + (z1h3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x617 = + (x518 & 
(uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x518 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x717 = + (x617 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x617 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x817 = + (x717 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x717 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x917 = + (x817 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x817 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h12 = (x917 << (uint32_t)32U | x917 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x5 = + (z2h1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x618 = + (x5 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x5 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x718 = + (x618 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x618 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x818 = + (x718 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x718 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x918 = + (x818 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x818 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h22 = (x918 << (uint32_t)32U | x918 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z4_1 = z07; + uint64_t z4_2 = z0h12 ^ z23; + uint64_t z4_3 = z126 ^ z2h22; + uint64_t z4_4 = z1h12; + uint64_t z12 = z11 ^ z4_1; + uint64_t z22 = z21 ^ z4_2; + uint64_t z32 = z31 ^ z4_3; + uint64_t z42 = z41 ^ z4_4; + uint64_t v3 = z42 << (uint32_t)1U | z32 >> (uint32_t)63U; + uint64_t v20 = z32 << (uint32_t)1U | z22 >> (uint32_t)63U; + uint64_t v1 = z22 << (uint32_t)1U | z12 >> (uint32_t)63U; + uint64_t v0 = z12 << (uint32_t)1U; + 
uint64_t v21 = v20 ^ (v0 ^ (v0 >> (uint32_t)1U ^ (v0 >> (uint32_t)2U ^ v0 >> (uint32_t)7U))); + uint64_t v11 = v1 ^ (v0 << (uint32_t)63U ^ (v0 << (uint32_t)62U ^ v0 << (uint32_t)57U)); + uint64_t + v31 = v3 ^ (v11 ^ (v11 >> (uint32_t)1U ^ (v11 >> (uint32_t)2U ^ v11 >> (uint32_t)7U))); + uint64_t v22 = v21 ^ (v11 << (uint32_t)63U ^ (v11 << (uint32_t)62U ^ v11 << (uint32_t)57U)); + uint64_t v10 = v22; + uint64_t v2 = v31; + acc[0U] = v10; + acc[1U] = v2; +} + +void Hacl_Gf128_CT64_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + load_precompute_r(pre, key); +} + +void Hacl_Gf128_CT64_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f[8U] = { 0U }; + uint64_t *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y0 = tb; + uint64_t *x1 = b4 + (uint32_t)2U; + uint8_t *y1 = tb + (uint32_t)16U; + uint64_t *x2 = b4 + (uint32_t)4U; + uint8_t *y2 = tb + (uint32_t)32U; + uint64_t *x3 = b4 + (uint32_t)6U; + uint8_t *y3 = tb + (uint32_t)48U; + uint64_t u = load64_be(y0); + x0[1U] = u; + uint64_t u0 = load64_be(y0 + (uint32_t)8U); + x0[0U] = u0; + uint64_t u1 = load64_be(y1); + x1[1U] = u1; + uint64_t u2 = load64_be(y1 + (uint32_t)8U); + x1[0U] = u2; + uint64_t u3 = load64_be(y2); + x2[1U] = u3; + uint64_t u4 = load64_be(y2 + (uint32_t)8U); + x2[0U] = u4; + uint64_t u5 = load64_be(y3); + x3[1U] = u5; + uint64_t u6 = load64_be(y3 + (uint32_t)8U); + x3[0U] = u6; + uint64_t *uu____0 = b4; + uu____0[0U] = uu____0[0U] ^ acc[0U]; + uu____0[1U] = uu____0[1U] ^ acc[1U]; + normalize4(acc, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t 
*r1 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + fmul0(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + fmul0(acc, r1); + return; + } +} + +void +(*Hacl_Gf128_CT64_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = + Hacl_Gf128_CT64_gcm_update_blocks; + +void Hacl_Gf128_CT64_gcm_emit(uint8_t *tag, uint64_t *ctx) +{ + uint64_t *acc = ctx; + uint64_t r0 = acc[1U]; + uint64_t r1 = acc[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + +void Hacl_Gf128_CT64_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + uint64_t ctx[18U] = { 0U }; + uint64_t *acc = ctx; + uint64_t *pre0 = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + load_precompute_r(pre0, key); + uint64_t *acc0 = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f[8U] = { 0U }; + uint64_t *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y0 = tb; + uint64_t *x1 = b4 + (uint32_t)2U; + uint8_t *y1 = tb + (uint32_t)16U; + uint64_t *x2 = b4 + (uint32_t)4U; + uint8_t *y2 = tb + (uint32_t)32U; + uint64_t *x3 = b4 + (uint32_t)6U; + uint8_t *y3 = tb + 
(uint32_t)48U; + uint64_t u = load64_be(y0); + x0[1U] = u; + uint64_t u0 = load64_be(y0 + (uint32_t)8U); + x0[0U] = u0; + uint64_t u1 = load64_be(y1); + x1[1U] = u1; + uint64_t u2 = load64_be(y1 + (uint32_t)8U); + x1[0U] = u2; + uint64_t u3 = load64_be(y2); + x2[1U] = u3; + uint64_t u4 = load64_be(y2 + (uint32_t)8U); + x2[0U] = u4; + uint64_t u5 = load64_be(y3); + x3[1U] = u5; + uint64_t u6 = load64_be(y3 + (uint32_t)8U); + x3[0U] = u6; + uint64_t *uu____0 = b4; + uu____0[0U] = uu____0[0U] ^ acc0[0U]; + uu____0[1U] = uu____0[1U] ^ acc0[1U]; + normalize4(acc0, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r10 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + fmul0(acc0, r10); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + fmul0(acc0, r10); + } + uint64_t *acc1 = ctx; + uint64_t r0 = acc1[1U]; + uint64_t r1 = acc1[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + diff --git a/src/Hacl_Gf128_PreComp.c b/src/Hacl_Gf128_PreComp.c deleted file mode 100644 index fa12b870..00000000 --- a/src/Hacl_Gf128_PreComp.c +++ /dev/null @@ -1,461 +0,0 @@ -/* MIT License - * - * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation - * Copyright (c) 2022-2023 HACL* Contributors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of 
this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - - -#include "Hacl_Gf128_PreComp.h" - -void Hacl_Impl_Gf128_FieldPreComp_fmul(uint64_t *x, uint64_t *y) -{ - uint64_t res[2U] = { 0U }; - uint64_t y_[2U] = { 0U }; - y_[0U] = y[0U]; - y_[1U] = y[1U]; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); - res[0U] = res[0U] ^ (y_[0U] & m); - res[1U] = res[1U] ^ (y_[1U] & m); - uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); - y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; - y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); - res[0U] = res[0U] ^ (y_[0U] & m); - res[1U] = res[1U] ^ (y_[1U] & m); - uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); - y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; - y_[1U] = y_[1U] >> (uint32_t)1U ^ 
((uint64_t)0xE100000000000000U & m0); - } - x[0U] = res[0U]; - x[1U] = res[1U]; -} - -static inline void prepare(uint64_t *pre, uint64_t *r) -{ - memset(pre, 0U, (uint32_t)256U * sizeof (uint64_t)); - uint64_t sh[2U] = { 0U }; - sh[0U] = r[0U]; - sh[1U] = r[1U]; - uint64_t *pre1 = pre; - uint64_t *pre2 = pre + (uint32_t)128U; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - memcpy(pre1 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); - uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); - sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; - sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - memcpy(pre2 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); - uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); - sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; - sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); - } -} - -void Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(uint64_t *pre, uint8_t *key) -{ - uint64_t *r4321 = pre; - uint64_t *r1 = r4321 + (uint32_t)6U; - uint64_t *r2 = r4321 + (uint32_t)4U; - uint64_t *r3 = r4321 + (uint32_t)2U; - uint64_t *r4 = r4321; - uint64_t *table2 = pre + (uint32_t)8U; - uint64_t u = load64_be(key); - r1[1U] = u; - uint64_t u0 = load64_be(key + (uint32_t)8U); - r1[0U] = u0; - r4[0U] = r1[0U]; - r4[1U] = r1[1U]; - r3[0U] = r1[0U]; - r3[1U] = r1[1U]; - r2[0U] = r1[0U]; - r2[1U] = r1[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(r2, r1); - Hacl_Impl_Gf128_FieldPreComp_fmul(r3, r2); - Hacl_Impl_Gf128_FieldPreComp_fmul(r4, r3); - prepare(table2, r4); -} - -static inline void fmul_pre(uint64_t *x, uint64_t *pre) -{ - uint64_t *tab = pre + (uint32_t)8U; - uint64_t tmp[2U] = { 0U }; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t *uu____0 = tab + (uint32_t)2U * i; - uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); - tmp[0U] = tmp[0U] ^ 
(uu____0[0U] & m); - tmp[1U] = tmp[1U] ^ (uu____0[1U] & m); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t *uu____1 = tab + (uint32_t)128U + (uint32_t)2U * i; - uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); - tmp[0U] = tmp[0U] ^ (uu____1[0U] & m); - tmp[1U] = tmp[1U] ^ (uu____1[1U] & m); - } - x[0U] = tmp[0U]; - x[1U] = tmp[1U]; -} - -void Hacl_Impl_Gf128_FieldPreComp_fmul_r4(uint64_t *x, uint64_t *pre) -{ - fmul_pre(x, pre); - fmul_pre(x + (uint32_t)2U, pre); - fmul_pre(x + (uint32_t)4U, pre); - fmul_pre(x + (uint32_t)6U, pre); -} - -void Hacl_Impl_Gf128_FieldPreComp_normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) -{ - uint64_t *x1 = x; - uint64_t *x2 = x + (uint32_t)2U; - uint64_t *x3 = x + (uint32_t)4U; - uint64_t *x4 = x + (uint32_t)6U; - fmul_pre(x, pre); - Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)2U, pre + (uint32_t)2U); - Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)4U, pre + (uint32_t)4U); - Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)6U, pre + (uint32_t)6U); - acc[0U] = x1[0U]; - acc[1U] = x1[1U]; - acc[0U] = acc[0U] ^ x2[0U]; - acc[1U] = acc[1U] ^ x2[1U]; - acc[0U] = acc[0U] ^ x3[0U]; - acc[1U] = acc[1U] ^ x3[1U]; - acc[0U] = acc[0U] ^ x4[0U]; - acc[1U] = acc[1U] ^ x4[1U]; -} - -void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key) -{ - uint64_t *acc = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - acc[0U] = (uint64_t)0U; - acc[1U] = (uint64_t)0U; - Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre, key); -} - -void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) -{ - uint64_t *acc = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; - uint8_t *t0 = text; - if (len0 > (uint32_t)0U) - { - uint64_t f0[8U] = { 0U }; - uint64_t *b4 = f0; - uint64_t f[8U] = { 0U }; - uint64_t *acc4 = f; - uint8_t *tb = t0; - memcpy(acc4, acc, (uint32_t)2U * sizeof (uint64_t)); - uint64_t *x00 = b4; - uint8_t *y00 
= tb; - uint64_t *x10 = b4 + (uint32_t)2U; - uint8_t *y10 = tb + (uint32_t)16U; - uint64_t *x20 = b4 + (uint32_t)4U; - uint8_t *y20 = tb + (uint32_t)32U; - uint64_t *x30 = b4 + (uint32_t)6U; - uint8_t *y30 = tb + (uint32_t)48U; - uint64_t u0 = load64_be(y00); - x00[1U] = u0; - uint64_t u1 = load64_be(y00 + (uint32_t)8U); - x00[0U] = u1; - uint64_t u2 = load64_be(y10); - x10[1U] = u2; - uint64_t u3 = load64_be(y10 + (uint32_t)8U); - x10[0U] = u3; - uint64_t u4 = load64_be(y20); - x20[1U] = u4; - uint64_t u5 = load64_be(y20 + (uint32_t)8U); - x20[0U] = u5; - uint64_t u6 = load64_be(y30); - x30[1U] = u6; - uint64_t u7 = load64_be(y30 + (uint32_t)8U); - x30[0U] = u7; - uint64_t *x01 = acc4; - uint64_t *y01 = b4; - uint64_t *x11 = acc4 + (uint32_t)2U; - uint64_t *y11 = b4 + (uint32_t)2U; - uint64_t *x21 = acc4 + (uint32_t)4U; - uint64_t *y21 = b4 + (uint32_t)4U; - uint64_t *x31 = acc4 + (uint32_t)6U; - uint64_t *y31 = b4 + (uint32_t)6U; - x01[0U] = x01[0U] ^ y01[0U]; - x01[1U] = x01[1U] ^ y01[1U]; - x11[0U] = x11[0U] ^ y11[0U]; - x11[1U] = x11[1U] ^ y11[1U]; - x21[0U] = x21[0U] ^ y21[0U]; - x21[1U] = x21[1U] ^ y21[1U]; - x31[0U] = x31[0U] ^ y31[0U]; - x31[1U] = x31[1U] ^ y31[1U]; - uint32_t len1 = len0 - (uint32_t)64U; - uint8_t *text1 = t0 + (uint32_t)64U; - uint32_t nb = len1 / (uint32_t)64U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb1 = text1 + i * (uint32_t)64U; - uint64_t *x0 = b4; - uint8_t *y02 = tb1; - uint64_t *x12 = b4 + (uint32_t)2U; - uint8_t *y12 = tb1 + (uint32_t)16U; - uint64_t *x22 = b4 + (uint32_t)4U; - uint8_t *y22 = tb1 + (uint32_t)32U; - uint64_t *x32 = b4 + (uint32_t)6U; - uint8_t *y32 = tb1 + (uint32_t)48U; - uint64_t u = load64_be(y02); - x0[1U] = u; - uint64_t u8 = load64_be(y02 + (uint32_t)8U); - x0[0U] = u8; - uint64_t u9 = load64_be(y12); - x12[1U] = u9; - uint64_t u10 = load64_be(y12 + (uint32_t)8U); - x12[0U] = u10; - uint64_t u11 = load64_be(y22); - x22[1U] = u11; - uint64_t u12 = load64_be(y22 + (uint32_t)8U); - 
x22[0U] = u12; - uint64_t u13 = load64_be(y32); - x32[1U] = u13; - uint64_t u14 = load64_be(y32 + (uint32_t)8U); - x32[0U] = u14; - Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); - uint64_t *x02 = acc4; - uint64_t *y0 = b4; - uint64_t *x1 = acc4 + (uint32_t)2U; - uint64_t *y1 = b4 + (uint32_t)2U; - uint64_t *x2 = acc4 + (uint32_t)4U; - uint64_t *y2 = b4 + (uint32_t)4U; - uint64_t *x3 = acc4 + (uint32_t)6U; - uint64_t *y3 = b4 + (uint32_t)6U; - x02[0U] = x02[0U] ^ y0[0U]; - x02[1U] = x02[1U] ^ y0[1U]; - x1[0U] = x1[0U] ^ y1[0U]; - x1[1U] = x1[1U] ^ y1[1U]; - x2[0U] = x2[0U] ^ y2[0U]; - x2[1U] = x2[1U] ^ y2[1U]; - x3[0U] = x3[0U] ^ y3[0U]; - x3[1U] = x3[1U] ^ y3[1U]; - } - Hacl_Impl_Gf128_FieldPreComp_normalize4(acc, acc4, pre); - } - uint32_t len1 = len - len0; - uint8_t *t1 = text + len0; - uint64_t *r1 = pre + (uint32_t)6U; - uint32_t nb = len1 / (uint32_t)16U; - uint32_t rem = len1 % (uint32_t)16U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb = t1 + i * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint64_t u = load64_be(tb); - elem[1U] = u; - uint64_t u0 = load64_be(tb + (uint32_t)8U); - elem[0U] = u0; - acc[0U] = acc[0U] ^ elem[0U]; - acc[1U] = acc[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); - } - if (rem > (uint32_t)0U) - { - uint8_t *last = t1 + nb * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint8_t b[16U] = { 0U }; - memcpy(b, last, rem * sizeof (uint8_t)); - uint64_t u = load64_be(b); - elem[1U] = u; - uint64_t u0 = load64_be(b + (uint32_t)8U); - elem[0U] = u0; - acc[0U] = acc[0U] ^ elem[0U]; - acc[1U] = acc[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); - return; - } -} - -void -(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = - Hacl_Gf128_PreComp_gcm_update_blocks; - -void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx) -{ - uint64_t *acc = ctx; - uint64_t r0 = acc[1U]; - uint64_t r1 = acc[0U]; - store64_be(tag, r0); - store64_be(tag + 
(uint32_t)8U, r1); -} - -void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) -{ - uint64_t ctx[266U] = { 0U }; - uint64_t *acc = ctx; - uint64_t *pre0 = ctx + (uint32_t)2U; - acc[0U] = (uint64_t)0U; - acc[1U] = (uint64_t)0U; - Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre0, key); - uint64_t *acc0 = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; - uint8_t *t0 = text; - if (len0 > (uint32_t)0U) - { - uint64_t f0[8U] = { 0U }; - uint64_t *b4 = f0; - uint64_t f[8U] = { 0U }; - uint64_t *acc4 = f; - uint8_t *tb = t0; - memcpy(acc4, acc0, (uint32_t)2U * sizeof (uint64_t)); - uint64_t *x00 = b4; - uint8_t *y00 = tb; - uint64_t *x10 = b4 + (uint32_t)2U; - uint8_t *y10 = tb + (uint32_t)16U; - uint64_t *x20 = b4 + (uint32_t)4U; - uint8_t *y20 = tb + (uint32_t)32U; - uint64_t *x30 = b4 + (uint32_t)6U; - uint8_t *y30 = tb + (uint32_t)48U; - uint64_t u0 = load64_be(y00); - x00[1U] = u0; - uint64_t u1 = load64_be(y00 + (uint32_t)8U); - x00[0U] = u1; - uint64_t u2 = load64_be(y10); - x10[1U] = u2; - uint64_t u3 = load64_be(y10 + (uint32_t)8U); - x10[0U] = u3; - uint64_t u4 = load64_be(y20); - x20[1U] = u4; - uint64_t u5 = load64_be(y20 + (uint32_t)8U); - x20[0U] = u5; - uint64_t u6 = load64_be(y30); - x30[1U] = u6; - uint64_t u7 = load64_be(y30 + (uint32_t)8U); - x30[0U] = u7; - uint64_t *x01 = acc4; - uint64_t *y01 = b4; - uint64_t *x11 = acc4 + (uint32_t)2U; - uint64_t *y11 = b4 + (uint32_t)2U; - uint64_t *x21 = acc4 + (uint32_t)4U; - uint64_t *y21 = b4 + (uint32_t)4U; - uint64_t *x31 = acc4 + (uint32_t)6U; - uint64_t *y31 = b4 + (uint32_t)6U; - x01[0U] = x01[0U] ^ y01[0U]; - x01[1U] = x01[1U] ^ y01[1U]; - x11[0U] = x11[0U] ^ y11[0U]; - x11[1U] = x11[1U] ^ y11[1U]; - x21[0U] = x21[0U] ^ y21[0U]; - x21[1U] = x21[1U] ^ y21[1U]; - x31[0U] = x31[0U] ^ y31[0U]; - x31[1U] = x31[1U] ^ y31[1U]; - uint32_t len1 = len0 - (uint32_t)64U; - uint8_t *text1 = t0 + (uint32_t)64U; - uint32_t nb = len1 / 
(uint32_t)64U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb1 = text1 + i * (uint32_t)64U; - uint64_t *x0 = b4; - uint8_t *y02 = tb1; - uint64_t *x12 = b4 + (uint32_t)2U; - uint8_t *y12 = tb1 + (uint32_t)16U; - uint64_t *x22 = b4 + (uint32_t)4U; - uint8_t *y22 = tb1 + (uint32_t)32U; - uint64_t *x32 = b4 + (uint32_t)6U; - uint8_t *y32 = tb1 + (uint32_t)48U; - uint64_t u = load64_be(y02); - x0[1U] = u; - uint64_t u8 = load64_be(y02 + (uint32_t)8U); - x0[0U] = u8; - uint64_t u9 = load64_be(y12); - x12[1U] = u9; - uint64_t u10 = load64_be(y12 + (uint32_t)8U); - x12[0U] = u10; - uint64_t u11 = load64_be(y22); - x22[1U] = u11; - uint64_t u12 = load64_be(y22 + (uint32_t)8U); - x22[0U] = u12; - uint64_t u13 = load64_be(y32); - x32[1U] = u13; - uint64_t u14 = load64_be(y32 + (uint32_t)8U); - x32[0U] = u14; - Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); - uint64_t *x02 = acc4; - uint64_t *y0 = b4; - uint64_t *x1 = acc4 + (uint32_t)2U; - uint64_t *y1 = b4 + (uint32_t)2U; - uint64_t *x2 = acc4 + (uint32_t)4U; - uint64_t *y2 = b4 + (uint32_t)4U; - uint64_t *x3 = acc4 + (uint32_t)6U; - uint64_t *y3 = b4 + (uint32_t)6U; - x02[0U] = x02[0U] ^ y0[0U]; - x02[1U] = x02[1U] ^ y0[1U]; - x1[0U] = x1[0U] ^ y1[0U]; - x1[1U] = x1[1U] ^ y1[1U]; - x2[0U] = x2[0U] ^ y2[0U]; - x2[1U] = x2[1U] ^ y2[1U]; - x3[0U] = x3[0U] ^ y3[0U]; - x3[1U] = x3[1U] ^ y3[1U]; - } - Hacl_Impl_Gf128_FieldPreComp_normalize4(acc0, acc4, pre); - } - uint32_t len1 = len - len0; - uint8_t *t1 = text + len0; - uint64_t *r10 = pre + (uint32_t)6U; - uint32_t nb = len1 / (uint32_t)16U; - uint32_t rem = len1 % (uint32_t)16U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb = t1 + i * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint64_t u = load64_be(tb); - elem[1U] = u; - uint64_t u0 = load64_be(tb + (uint32_t)8U); - elem[0U] = u0; - acc0[0U] = acc0[0U] ^ elem[0U]; - acc0[1U] = acc0[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); - } - if (rem > (uint32_t)0U) - { - 
uint8_t *last = t1 + nb * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint8_t b[16U] = { 0U }; - memcpy(b, last, rem * sizeof (uint8_t)); - uint64_t u = load64_be(b); - elem[1U] = u; - uint64_t u0 = load64_be(b + (uint32_t)8U); - elem[0U] = u0; - acc0[0U] = acc0[0U] ^ elem[0U]; - acc0[1U] = acc0[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); - } - uint64_t *acc1 = ctx; - uint64_t r0 = acc1[1U]; - uint64_t r1 = acc1[0U]; - store64_be(tag, r0); - store64_be(tag + (uint32_t)8U, r1); -} - diff --git a/src/msvc/EverCrypt_AEAD.c b/src/msvc/EverCrypt_AEAD.c index 6c21c319..86630862 100644 --- a/src/msvc/EverCrypt_AEAD.c +++ b/src/msvc/EverCrypt_AEAD.c @@ -28,8 +28,10 @@ #include "internal/Vale.h" #ifdef HACL_CAN_COMPILE_AESNI_PCLMUL #include "Hacl_AES_128_GCM_NI.h" +#include "Hacl_AES_256_GCM_NI.h" #endif -#include "Hacl_AES_128_GCM_M32.h" +#include "Hacl_AES_128_GCM_CT64.h" +#include "Hacl_AES_256_GCM_CT64.h" #include "internal/Hacl_Spec.h" #include "config.h" #include "hacl-cpu-features.h" @@ -69,13 +71,13 @@ Spec_Agile_AEAD_alg EverCrypt_AEAD_alg_of_state(EverCrypt_AEAD_state_s *s) } case Spec_Cipher_Expansion_Vale_AES128: case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: - case Spec_Cipher_Expansion_M32_AES128: + case Spec_Cipher_Expansion_CT64_AES128: { return Spec_Agile_AEAD_AES128_GCM; } case Spec_Cipher_Expansion_Vale_AES256: case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: - case Spec_Cipher_Expansion_M32_AES256: + case Spec_Cipher_Expansion_CT64_AES256: { return Spec_Agile_AEAD_AES256_GCM; } @@ -122,10 +124,11 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) return EverCrypt_Error_Success; } else - #elif HACL_CAN_COMPILE_AESNI_PCLMUL + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL if (hacl_aesgcm_support() != 0) { - uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)288U, sizeof (uint8_t)); Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = 
(Lib_IntVector_Intrinsics_vec128 *)ek; Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx, k); EverCrypt_AEAD_state_s @@ -137,12 +140,12 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) else #endif { - uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)928U, sizeof (uint8_t)); uint64_t *aes_gcm_ctx = (uint64_t *)ek; - Hacl_AES_128_GCM_M32_aes128_gcm_init(aes_gcm_ctx, k); + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx, k); EverCrypt_AEAD_state_s *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); - p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_M32_AES128, .ek = ek }); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }); *dst = p; return EverCrypt_Error_Success; } @@ -151,12 +154,12 @@ create_in_aes128_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) static EverCrypt_Error_error_code create_in_aes256_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) { + #if HACL_CAN_COMPILE_VALE bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)544U, sizeof (uint8_t)); @@ -170,8 +173,32 @@ create_in_aes256_gcm(EverCrypt_AEAD_state_s **dst, uint8_t *k) *dst = p; return EverCrypt_Error_Success; } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx, k); + 
EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } + else + #endif + { + uint8_t *ek = (uint8_t *)KRML_HOST_CALLOC((uint32_t)1184U, sizeof (uint8_t)); + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx, k); + EverCrypt_AEAD_state_s + *p = (EverCrypt_AEAD_state_s *)KRML_HOST_MALLOC(sizeof (EverCrypt_AEAD_state_s)); + p[0U] = ((EverCrypt_AEAD_state_s){ .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }); + *dst = p; + return EverCrypt_Error_Success; + } } /** @@ -379,7 +406,7 @@ encrypt_aes128_gcm_aesni_pclmul( } static EverCrypt_Error_error_code -encrypt_aes128_gcm_m32( +encrypt_aes128_gcm_ct64( EverCrypt_AEAD_state_s *s, uint8_t *iv, uint32_t iv_len, @@ -403,7 +430,7 @@ encrypt_aes128_gcm_m32( uint8_t *ek = scrut.ek; uint64_t *aes_gcm_ctx = (uint64_t *)ek; uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); - Hacl_AES_128_GCM_M32_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); memcpy(cipher, out, plain_len); memcpy(tag, out + plain_len, 16); KRML_HOST_FREE(out); @@ -534,6 +561,78 @@ encrypt_aes256_gcm( #endif } +static EverCrypt_Error_error_code +encrypt_aes256_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + 
uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +encrypt_aes256_gcm_ct64( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *plain, + uint32_t plain_len, + uint8_t *cipher, + uint8_t *tag +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + return EverCrypt_Error_Success; +} + /** Encrypt and authenticate a message (`plain`) with associated data (`ad`). 
@@ -579,10 +678,26 @@ EverCrypt_AEAD_encrypt( { return encrypt_aes128_gcm(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } + case Spec_Cipher_Expansion_CT64_AES128: + { + return encrypt_aes128_gcm_ct64(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } case Spec_Cipher_Expansion_Vale_AES256: { return encrypt_aes256_gcm(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: + { + return encrypt_aes256_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } + case Spec_Cipher_Expansion_CT64_AES256: + { + return encrypt_aes256_gcm_ct64(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); + } case Spec_Cipher_Expansion_Hacl_CHACHA20: { if (iv_len != (uint32_t)12U) @@ -592,14 +707,6 @@ EverCrypt_AEAD_encrypt( EverCrypt_Chacha20Poly1305_aead_encrypt(ek, iv, ad_len, ad, plain_len, plain, cipher, tag); return EverCrypt_Error_Success; } - case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: - { - return encrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); - } - case Spec_Cipher_Expansion_M32_AES128: - { - return encrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, plain, plain_len, cipher, tag); - } default: { KRML_HOST_EPRINTF("KaRaMeL incomplete match at %s:%d\n", __FILE__, __LINE__); @@ -609,12 +716,9 @@ EverCrypt_AEAD_encrypt( } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. 
You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_encrypt_expand_aes128_gcm_no_check( @@ -742,23 +846,68 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm_no_check( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { +
EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #endif + return EverCrypt_Error_Success; } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_encrypt_expand_aes256_gcm_no_check( @@ -886,14 +1035,62 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm_no_check( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len +
16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } #endif + return EverCrypt_Error_Success; } EverCrypt_Error_error_code @@ -909,12 +1106,12 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm( uint8_t *tag ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[480U] = { 0U }; @@ -1029,10 +1226,70 @@ EverCrypt_AEAD_encrypt_expand_aes128_gcm( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return 
EverCrypt_Error_Success; } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + else + #endif + { + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + return EverCrypt_Error_Success; } 
EverCrypt_Error_error_code @@ -1048,12 +1305,12 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm( uint8_t *tag ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[544U] = { 0U }; @@ -1168,10 +1425,70 @@ EverCrypt_AEAD_encrypt_expand_aes256_gcm( (uint32_t)(uint64_t)plain_len % (uint32_t)16U * sizeof (uint8_t)); r = EverCrypt_Error_Success; } - return EverCrypt_Error_Success; } - #endif - return EverCrypt_Error_UnsupportedAlgorithm; + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + else + #endif + { + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = 
Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + EverCrypt_Error_error_code r; + if (s == NULL) + { + r = EverCrypt_Error_InvalidKey; + } + else if (iv_len == (uint32_t)0U) + { + r = EverCrypt_Error_InvalidIVLength; + } + else + { + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* out = (uint8_t*)KRML_HOST_MALLOC(plain_len + 16); + Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt(aes_gcm_ctx, plain_len, out, plain, ad_len, ad, iv_len, iv); + memcpy(cipher, out, plain_len); + memcpy(tag, out + plain_len, 16); + KRML_HOST_FREE(out); + r = EverCrypt_Error_Success; + } + } + return EverCrypt_Error_Success; } EverCrypt_Error_error_code @@ -1439,7 +1756,7 @@ decrypt_aes128_gcm_aesni_pclmul( } static EverCrypt_Error_error_code -decrypt_aes128_gcm_m32( +decrypt_aes128_gcm_ct64( EverCrypt_AEAD_state_s *s, uint8_t *iv, uint32_t iv_len, @@ -1465,7 +1782,7 @@ decrypt_aes128_gcm_m32( uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); memcpy(in, cipher, cipher_len); memcpy(in + cipher_len, tag, 16); - bool r = Hacl_AES_128_GCM_M32_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); KRML_HOST_FREE(in); if (r) { @@ -1608,6 +1925,86 @@ decrypt_aes256_gcm( #endif } +static EverCrypt_Error_error_code +decrypt_aes256_gcm_aesni_pclmul( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek; + uint8_t* in = 
(uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + #else + KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", + __FILE__, + __LINE__, + "statically unreachable"); + KRML_HOST_EXIT(255U); + #endif +} + +static EverCrypt_Error_error_code +decrypt_aes256_gcm_ct64( + EverCrypt_AEAD_state_s *s, + uint8_t *iv, + uint32_t iv_len, + uint8_t *ad, + uint32_t ad_len, + uint8_t *cipher, + uint32_t cipher_len, + uint8_t *tag, + uint8_t *dst +) +{ + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; +} + static EverCrypt_Error_error_code decrypt_chacha20_poly1305( EverCrypt_AEAD_state_s *s, @@ -1696,21 +2093,29 @@ EverCrypt_AEAD_decrypt( { return decrypt_aes128_gcm(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + { + return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } + case Spec_Cipher_Expansion_CT64_AES128: + { + return decrypt_aes128_gcm_ct64(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + } case Spec_Cipher_Expansion_Vale_AES256: { return decrypt_aes256_gcm(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - 
case Spec_Cipher_Expansion_Hacl_CHACHA20: + case Spec_Cipher_Expansion_AESNI_PCLMUL_AES256: { - return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_aes256_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - case Spec_Cipher_Expansion_AESNI_PCLMUL_AES128: + case Spec_Cipher_Expansion_CT64_AES256: { - return decrypt_aes128_gcm_aesni_pclmul(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_aes256_gcm_ct64(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } - case Spec_Cipher_Expansion_M32_AES128: + case Spec_Cipher_Expansion_Hacl_CHACHA20: { - return decrypt_aes128_gcm_m32(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); + return decrypt_chacha20_poly1305(s, iv, iv_len, ad, ad_len, cipher, cipher_len, tag, dst); } default: { @@ -1721,12 +2126,9 @@ EverCrypt_AEAD_decrypt( } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture.
*/ EverCrypt_Error_error_code EverCrypt_AEAD_decrypt_expand_aes128_gcm_no_check( @@ -1860,22 +2262,67 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm_no_check( return EverCrypt_Error_Success; } return EverCrypt_Error_AuthenticationFailure; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, 
cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #endif } /** -WARNING: this function doesn't perform any dynamic - hardware check. You MUST make sure your hardware supports the - implementation of AESGCM. Besides, this function was not designed - for cross-compilation: if you compile it on a system which doesn't - support Vale, it will compile it to a function which makes the - program exit. +WARNING: this function doesn't perform any dynamic hardware + check. You need to configure particular flags to control hardware + features that this function uses on target processor architecture. */ EverCrypt_Error_error_code EverCrypt_AEAD_decrypt_expand_aes256_gcm_no_check( @@ -2009,12 +2456,60 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm_no_check( return EverCrypt_Error_Success; } return EverCrypt_Error_AuthenticationFailure; + #elif HACL_CAN_COMPILE_AESNI_PCLMUL + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return
EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #else - KRML_HOST_EPRINTF("KaRaMeL abort at %s:%d\n%s\n", - __FILE__, - __LINE__, - "EverCrypt was compiled on a system which doesn\'t support Vale"); - KRML_HOST_EXIT(255U); + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; #endif } @@ -2031,12 +2526,12 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm( uint8_t *dst ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[480U] = { 0U }; @@ -2164,8 +2659,68 @@ EverCrypt_AEAD_decrypt_expand_aes128_gcm( } } } + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[288U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_128_GCM_NI_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = 
Spec_Cipher_Expansion_AESNI_PCLMUL_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_NI_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + { + uint8_t ek[928U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_128_GCM_CT64_aes128_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES128, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } } EverCrypt_Error_error_code @@ -2181,12 +2736,12 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm( uint8_t *dst ) { + #if HACL_CAN_COMPILE_VALE bool has_pclmulqdq = EverCrypt_AutoConfig2_has_pclmulqdq(); bool has_avx = EverCrypt_AutoConfig2_has_avx(); bool has_sse = EverCrypt_AutoConfig2_has_sse(); bool has_movbe = 
EverCrypt_AutoConfig2_has_movbe(); bool has_aesni = EverCrypt_AutoConfig2_has_aesni(); - #if HACL_CAN_COMPILE_VALE if (has_aesni && has_pclmulqdq && has_avx && has_sse && has_movbe) { uint8_t ek[544U] = { 0U }; @@ -2314,8 +2869,68 @@ EverCrypt_AEAD_decrypt_expand_aes256_gcm( } } } + else + #endif + #if HACL_CAN_COMPILE_AESNI_PCLMUL + if (hacl_aesgcm_support() != 0) + { + uint8_t ek[352U] = { 0U }; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx0 = (Lib_IntVector_Intrinsics_vec128 *)ek; + Hacl_AES_256_GCM_NI_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_AESNI_PCLMUL_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + Lib_IntVector_Intrinsics_vec128 *aes_gcm_ctx = (Lib_IntVector_Intrinsics_vec128 *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } + else #endif - return EverCrypt_Error_UnsupportedAlgorithm; + { + uint8_t ek[1184U] = { 0U }; + uint64_t *aes_gcm_ctx0 = (uint64_t *)ek; + Hacl_AES_256_GCM_CT64_aes256_gcm_init(aes_gcm_ctx0, k); + EverCrypt_AEAD_state_s p = { .impl = Spec_Cipher_Expansion_CT64_AES256, .ek = ek }; + EverCrypt_AEAD_state_s *s = &p; + if (s == NULL) + { + return EverCrypt_Error_InvalidKey; + } + if (iv_len == (uint32_t)0U) + { + return EverCrypt_Error_InvalidIVLength; + } + EverCrypt_AEAD_state_s scrut = *s; + uint8_t *ek0 = scrut.ek; + uint64_t *aes_gcm_ctx = (uint64_t *)ek0; + uint8_t* in = (uint8_t*)KRML_HOST_MALLOC(cipher_len + 16); + memcpy(in, cipher, cipher_len); + 
memcpy(in + cipher_len, tag, 16); + bool r = Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt(aes_gcm_ctx, cipher_len, dst, in, ad_len, ad, iv_len, iv); + KRML_HOST_FREE(in); + if (r) + { + return EverCrypt_Error_Success; + } + return EverCrypt_Error_AuthenticationFailure; + } } EverCrypt_Error_error_code diff --git a/src/msvc/Hacl_AES_128_BitSlice.c b/src/msvc/Hacl_AES_128_CTR32_BitSlice.c similarity index 98% rename from src/msvc/Hacl_AES_128_BitSlice.c rename to src/msvc/Hacl_AES_128_CTR32_BitSlice.c index a0d2938f..6af2a7c0 100644 --- a/src/msvc/Hacl_AES_128_BitSlice.c +++ b/src/msvc/Hacl_AES_128_CTR32_BitSlice.c @@ -23,7 +23,7 @@ */ -#include "internal/Hacl_AES_128_BitSlice.h" +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" #include "internal/Hacl_Lib.h" @@ -639,7 +639,7 @@ Hacl_Impl_AES_Generic_aes256_ctr_bitslice( } } -void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +void Hacl_AES_128_CTR32_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) { uint64_t *kex = ctx + (uint32_t)8U; uint64_t *n = ctx; @@ -778,13 +778,13 @@ void Hacl_AES_128_BitSlice_aes128_init(uint64_t *ctx, uint8_t *key, uint8_t *non Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); } -void Hacl_AES_128_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) +void Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(uint64_t *ctx, uint8_t *nonce) { uint64_t *n = ctx; Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); } -void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +void Hacl_AES_128_CTR32_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) { uint64_t *kex = ctx + (uint32_t)8U; uint64_t *n = ctx; @@ -806,7 +806,7 @@ void Hacl_AES_128_BitSlice_aes128_key_block(uint8_t *kb, uint64_t *ctx, uint32_t } inline void -Hacl_AES_128_BitSlice_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -955,7 +955,7 @@ 
Hacl_AES_128_BitSlice_aes128_ctr_encrypt( } inline void -Hacl_AES_128_BitSlice_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_BitSlice_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, diff --git a/src/msvc/Hacl_AES_128_NI.c b/src/msvc/Hacl_AES_128_CTR32_NI.c similarity index 96% rename from src/msvc/Hacl_AES_128_NI.c rename to src/msvc/Hacl_AES_128_CTR32_NI.c index 4a9d9ca8..21b2f898 100644 --- a/src/msvc/Hacl_AES_128_NI.c +++ b/src/msvc/Hacl_AES_128_CTR32_NI.c @@ -23,10 +23,14 @@ */ -#include "Hacl_AES_128_NI.h" +#include "Hacl_AES_128_CTR32_NI.h" void -Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, uint8_t *nonce) +Hacl_AES_128_CTR32_NI_aes128_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +) { Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = ctx; @@ -278,7 +282,8 @@ Hacl_AES_128_NI_aes128_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key, n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); } -void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +void +Hacl_AES_128_CTR32_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) { Lib_IntVector_Intrinsics_vec128 *n = ctx; uint8_t nb[16U] = { 0U }; @@ -287,7 +292,7 @@ void Hacl_AES_128_NI_aes128_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint } void -Hacl_AES_128_NI_aes128_key_block( +Hacl_AES_128_CTR32_NI_aes128_key_block( uint8_t *kb, Lib_IntVector_Intrinsics_vec128 *ctx, uint32_t counter @@ -296,14 +301,13 @@ Hacl_AES_128_NI_aes128_key_block( Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = ctx; KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; - uint32_t counter1 = counter; - uint32_t counter0 = htobe32(counter1); - uint32_t counter11 = htobe32(counter1 + (uint32_t)1U); - uint32_t counter2 = htobe32(counter1 + (uint32_t)2U); - uint32_t 
counter3 = htobe32(counter1 + (uint32_t)3U); + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); - st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter11, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); uint32_t klen = (uint32_t)1U; @@ -331,7 +335,7 @@ Hacl_AES_128_NI_aes128_key_block( } inline void -Hacl_AES_128_NI_aes128_ctr_encrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_encrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -598,11 +602,10 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -656,11 +659,10 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 
*kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -707,7 +709,7 @@ Hacl_AES_128_NI_aes128_ctr_encrypt( } inline void -Hacl_AES_128_NI_aes128_ctr_decrypt( +Hacl_AES_128_CTR32_NI_aes128_ctr_decrypt( uint32_t len, uint8_t *out, uint8_t *inp, @@ -974,11 +976,10 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -1032,11 +1033,10 @@ Hacl_AES_128_NI_aes128_ctr_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; 
Lib_IntVector_Intrinsics_vec128 *n1 = ctx; - uint32_t counter = ctr; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); diff --git a/src/msvc/Hacl_AES_128_GCM_CT64.c b/src/msvc/Hacl_AES_128_GCM_CT64.c new file mode 100644 index 00000000..1051af30 --- /dev/null +++ b/src/msvc/Hacl_AES_128_GCM_CT64.c @@ -0,0 +1,210 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_128_GCM_CT64.h" + +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" + +uint32_t Hacl_AES_128_GCM_CT64_aes_gcm_ctx_len = (uint32_t)116U; + +void Hacl_AES_128_GCM_CT64_aes128_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint8_t gcm_key[16U] = { 0U }; + uint8_t nonce0[12U] = { 0U }; + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + Hacl_AES_128_CTR32_BitSlice_aes128_init(aes_ctx, key, nonce0); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_CT64_gcm_init(gcm_ctx, gcm_key); +} + +void +Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *text, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t tmp[16U] = { 0U }; + uint8_t *cip = out; + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + uint64_t *tag_mix = ctx + (uint32_t)114U; + uint32_t ctr; + uint8_t tag_mix10[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + uint64_t *aes_ctx1 = ctx; + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx1, iv); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); + uint64_t u = load64_le(tag_mix10); + ctx[114U] = u; + uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); + ctx[115U] = u0; + ctr = (uint32_t)2U; + } + else + { + uint64_t *aes_ctx1 = ctx; + uint64_t *gcm_ctx1 = ctx + (uint32_t)96U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); + store64_be(gcm_key, gcm_ctx1[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + 
(uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u0; + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + uint64_t u = load64_le(tag_mix1); + ctx[114U] = u; + uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); + ctx[115U] = u1; + ctr = ctr0 + (uint32_t)1U; + } + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr); + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, cip); + store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_CT64_gcm_emit(tmp, gcm_ctx); + uint64_t u0 = load64_le(tmp); + uint64_t tmp0 = u0; + uint64_t u = load64_le(tmp + (uint32_t)8U); + uint64_t tmp1 = u; + uint64_t tmp01 = tmp0 ^ tag_mix[0U]; + uint64_t tmp11 = tmp1 ^ tag_mix[1U]; + store64_le(out + len, tmp01); + store64_le(out + len + (uint32_t)8U, tmp11); +} + +bool +Hacl_AES_128_GCM_CT64_aes128_gcm_decrypt( + uint64_t *ctx, + uint32_t len, + uint8_t *out, + uint8_t *cipher, + uint32_t aad_len, + uint8_t *aad, + uint32_t iv_len, + uint8_t *iv +) +{ + uint8_t scratch[18U] = { 0U }; + uint8_t *text = scratch; + uint8_t *result = scratch + (uint32_t)17U; + uint8_t *ciphertext = cipher; + uint8_t *tag = cipher + len; + uint32_t ctr; + uint8_t tag_mix0[16U] = { 0U }; + uint8_t gcm_key[16U] = { 0U }; + uint8_t tag_iv[16U] = { 0U }; + uint8_t size_iv[16U] = { 0U }; + uint8_t tag_mix1[16U] = { 0U }; + if (iv_len == (uint32_t)12U) + { + uint64_t *aes_ctx = ctx; + 
Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + uint64_t u = load64_le(tag_mix0); + ctx[114U] = u; + uint64_t u0 = load64_le(tag_mix0 + (uint32_t)8U); + ctx[115U] = u0; + ctr = (uint32_t)2U; + } + else + { + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx[8U]); + store64_be(gcm_key, gcm_ctx[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); + store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + (uint32_t)16U, + (uint32_t)1U, + size_iv[i] = tag_iv[i] ^ size_iv[i];); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_128_CTR32_BitSlice_aes128_set_nonce(aes_ctx, tag_iv); + uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); + uint32_t ctr0 = u0; + Hacl_AES_128_CTR32_BitSlice_aes128_key_block(tag_mix1, aes_ctx, ctr0); + uint64_t u = load64_le(tag_mix1); + ctx[114U] = u; + uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); + ctx[115U] = u1; + ctr = ctr0 + (uint32_t)1U; + } + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)96U; + uint64_t *tag_mix = ctx + (uint32_t)114U; + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); + store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); + store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_CT64_gcm_emit(text, gcm_ctx); + uint64_t u0 = load64_le(text); + uint64_t text0 = u0; + uint64_t u = load64_le(text + (uint32_t)8U); + uint64_t text1 = u; + uint64_t text01 = text0 ^ tag_mix[0U]; + uint64_t text11 = text1 ^ tag_mix[1U]; + store64_le(text, text01); + store64_le(text + (uint32_t)8U, text11); + KRML_MAYBE_FOR16(i, + (uint32_t)0U, + 
(uint32_t)16U, + (uint32_t)1U, + result[0U] = result[0U] | (text[i] ^ tag[i]);); + uint8_t res8 = result[0U]; + if (res8 == (uint8_t)0U) + { + Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr); + return true; + } + return false; +} + diff --git a/src/msvc/Hacl_AES_128_GCM_NI.c b/src/msvc/Hacl_AES_128_GCM_NI.c index 16e03251..1884764e 100644 --- a/src/msvc/Hacl_AES_128_GCM_NI.c +++ b/src/msvc/Hacl_AES_128_GCM_NI.c @@ -30,9 +30,9 @@ void Hacl_AES_128_GCM_NI_aes128_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, u uint8_t gcm_key[16U] = { 0U }; uint8_t nonce0[12U] = { 0U }; Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Hacl_AES_128_NI_aes128_init(aes_ctx, key, nonce0); - Hacl_AES_128_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Hacl_AES_128_CTR32_NI_aes128_init(aes_ctx, key, nonce0); + Hacl_AES_128_CTR32_NI_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key); } @@ -57,15 +57,15 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( if (iv_len == (uint32_t)12U) { Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, iv); - Hacl_AES_128_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); ctr = (uint32_t)2U; } else { Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); 
@@ -75,11 +75,11 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx, tag_iv); + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, tag_iv); uint32_t u = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u; - Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); ctr = ctr0 + (uint32_t)1U; } uint8_t *cip = out; @@ -93,11 +93,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -151,11 +150,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - 
uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -199,8 +197,9 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); memcpy(ob, last, rem * sizeof (uint8_t)); } - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[17U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip); uint8_t tmp[16U] = { 0U }; @@ -212,7 +211,6 @@ Hacl_AES_128_GCM_NI_aes128_gcm_encrypt( Lib_IntVector_Intrinsics_vec128 tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix); Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1); - gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; } bool @@ -232,28 +230,25 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( uint8_t *result = scratch + (uint32_t)17U; uint8_t *ciphertext = cipher; uint8_t *tag = cipher + len; - Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U]; uint32_t ctr; - uint8_t tag_mix10[16U] = { 0U }; + uint8_t tag_mix0[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; uint8_t tag_iv[16U] = { 0U }; uint8_t size_iv[16U] = { 0U }; uint8_t tag_mix1[16U] = { 0U }; if (iv_len == (uint32_t)12U) { - Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; - 
Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_NI_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix10); + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, iv); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0); ctr = (uint32_t)2U; } else { - Lib_IntVector_Intrinsics_vec128 *aes_ctx1 = ctx; - Lib_IntVector_Intrinsics_vec128 *gcm_ctx1 = ctx + (uint32_t)16U; - Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx1[4U]); + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]); Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, @@ -262,13 +257,17 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_NI_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_AES_128_CTR32_NI_aes128_set_nonce(aes_ctx, tag_iv); uint32_t u = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u; - Hacl_AES_128_NI_aes128_key_block(tag_mix1, aes_ctx1, ctr0); - ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); + Hacl_AES_128_CTR32_NI_aes128_key_block(tag_mix1, aes_ctx, ctr0); + ctx[17U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1); ctr = ctr0 + (uint32_t)1U; } + Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx; + Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[17U]; + gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero; Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad); Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext); store64_be(text, (uint64_t)(aad_len * 
(uint32_t)8U)); @@ -296,11 +295,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); @@ -354,11 +352,10 @@ Hacl_AES_128_GCM_NI_aes128_gcm_decrypt( KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; Lib_IntVector_Intrinsics_vec128 *kex = aes_ctx + (uint32_t)1U; Lib_IntVector_Intrinsics_vec128 *n = aes_ctx; - uint32_t counter = ctr1; - uint32_t counter0 = htobe32(counter); - uint32_t counter1 = htobe32(counter + (uint32_t)1U); - uint32_t counter2 = htobe32(counter + (uint32_t)2U); - uint32_t counter3 = htobe32(counter + (uint32_t)3U); + uint32_t counter0 = htobe32(ctr1); + uint32_t counter1 = htobe32(ctr1 + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr1 + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr1 + (uint32_t)3U); Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); diff --git a/src/msvc/Hacl_AES_256_CTR32_BitSlice.c b/src/msvc/Hacl_AES_256_CTR32_BitSlice.c new file mode 100644 index 00000000..461e3153 --- /dev/null +++ 
b/src/msvc/Hacl_AES_256_CTR32_BitSlice.c @@ -0,0 +1,634 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "Hacl_AES_256_CTR32_BitSlice.h" + +#include "internal/Hacl_AES_128_CTR32_BitSlice.h" + +void Hacl_AES_256_CTR32_BitSlice_aes256_init(uint64_t *ctx, uint8_t *key, uint8_t *nonce) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, key); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, key + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next01[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next01[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next11[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next11[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next02[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next02[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + 
Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next12[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next12[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next03[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next03[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next13[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next13[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next04[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next04[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + 
(uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next14[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next14[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next05[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next05[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next15[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next15[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next06[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next06[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next16[i]; + uint64_t n2 = n1 & (uint64_t)0x0f000f000f000f00U; + 
uint64_t n3 = n2 ^ n2 << (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next16[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * (uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n1 = next07[i]; + uint64_t n2 = n1 & (uint64_t)0xf000f000f000f000U; + uint64_t n3 = n2 ^ n2 >> (uint32_t)4U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)8U; + next07[i] = n4;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(uint64_t *ctx, uint8_t *nonce) +{ + uint64_t *n = ctx; + Hacl_Impl_AES_CoreBitSlice_load_nonce(n, nonce); +} + +void Hacl_AES_256_CTR32_BitSlice_aes256_key_block(uint8_t *kb, uint64_t *ctx, uint32_t counter) +{ + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n = ctx; + uint64_t st[8U] = { 0U }; + Hacl_Impl_AES_CoreBitSlice_load_state(st, n, counter); + uint32_t klen = (uint32_t)8U; + uint64_t *k0 = kex; + uint64_t *kr = kex + klen; + uint64_t *kn = kex + (uint32_t)14U * klen; + Hacl_Impl_AES_CoreBitSlice_xor_state_key1(st, k0); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + uint64_t *sub_key = kr + i * (uint32_t)8U; + Hacl_Impl_AES_CoreBitSlice_aes_enc(st, sub_key);); + Hacl_Impl_AES_CoreBitSlice_aes_enc_last(st, kn); + Hacl_Impl_AES_CoreBitSlice_store_block0(kb, st); +} + +void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint64_t *ctx, + uint32_t c +) +{ + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[128U] = { 0U }; + uint64_t *kex = ctx 
+ (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, k); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, k + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next01[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next01[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next11[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next11[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next02[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next02[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next12[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + 
uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next12[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next03[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next03[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next13[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next13[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next04[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next04[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next14[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next14[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, 
prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next05[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next05[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next15[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next15[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next06[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next06[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next16[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next16[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * 
(uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next07[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next07[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + +inline void +Hacl_AES_256_CTR32_BitSlice_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + uint64_t ctx[128U] = { 0U }; + uint64_t *kex = ctx + (uint32_t)8U; + uint64_t *n1 = ctx; + uint32_t klen = (uint32_t)8U; + uint64_t *next0 = kex; + uint64_t *next1 = kex + klen; + Hacl_Impl_AES_CoreBitSlice_load_key1(next0, k); + Hacl_Impl_AES_CoreBitSlice_load_key1(next1, k + (uint32_t)16U); + uint64_t *prev0 = next0; + uint64_t *prev1 = next1; + uint64_t *next01 = kex + klen * (uint32_t)2U; + uint64_t *next11 = kex + klen * (uint32_t)3U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next01, prev1, (uint8_t)0x01U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next01[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next01[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next01, prev0); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next11, next01, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next11[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next11[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next11, prev1); + uint64_t *prev01 = next01; + uint64_t *prev11 = next11; + uint64_t *next02 = 
kex + klen * (uint32_t)4U; + uint64_t *next12 = kex + klen * (uint32_t)5U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next02, prev11, (uint8_t)0x02U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next02[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next02[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next02, prev01); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next12, next02, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next12[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next12[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next12, prev11); + uint64_t *prev02 = next02; + uint64_t *prev12 = next12; + uint64_t *next03 = kex + klen * (uint32_t)6U; + uint64_t *next13 = kex + klen * (uint32_t)7U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next03, prev12, (uint8_t)0x04U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next03[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next03[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next03, prev02); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next13, next03, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next13[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next13[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next13, prev12); + uint64_t *prev03 = next03; + uint64_t *prev13 = next13; + uint64_t *next04 = kex + klen * (uint32_t)8U; + uint64_t *next14 = kex + klen * (uint32_t)9U; + 
Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next04, prev13, (uint8_t)0x08U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next04[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next04[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next04, prev03); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next14, next04, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next14[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next14[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next14, prev13); + uint64_t *prev04 = next04; + uint64_t *prev14 = next14; + uint64_t *next05 = kex + klen * (uint32_t)10U; + uint64_t *next15 = kex + klen * (uint32_t)11U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next05, prev14, (uint8_t)0x10U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next05[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next05[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next05, prev04); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next15, next05, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next15[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next15[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next15, prev14); + uint64_t *prev05 = next05; + uint64_t *prev15 = next15; + uint64_t *next06 = kex + klen * (uint32_t)12U; + uint64_t *next16 = kex + klen * (uint32_t)13U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next06, prev15, (uint8_t)0x20U); + KRML_MAYBE_FOR8(i, + 
(uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next06[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next06[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next06, prev05); + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next16, next06, (uint8_t)0U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next16[i]; + uint64_t n3 = n2 & (uint64_t)0x0f000f000f000f00U; + uint64_t n4 = n3 ^ n3 << (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next16[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next16, prev15); + uint64_t *prev06 = next06; + uint64_t *prev16 = next16; + uint64_t *next07 = kex + klen * (uint32_t)14U; + Hacl_Impl_AES_CoreBitSlice_aes_keygen_assist(next07, prev16, (uint8_t)0x40U); + KRML_MAYBE_FOR8(i, + (uint32_t)0U, + (uint32_t)8U, + (uint32_t)1U, + uint64_t n2 = next07[i]; + uint64_t n3 = n2 & (uint64_t)0xf000f000f000f000U; + uint64_t n4 = n3 ^ n3 >> (uint32_t)4U; + uint64_t n5 = n4 ^ n4 >> (uint32_t)8U; + next07[i] = n5;); + Hacl_Impl_AES_CoreBitSlice_key_expansion_step(next07, prev06); + Hacl_Impl_AES_CoreBitSlice_load_nonce(n1, n); + Hacl_Impl_AES_Generic_aes256_ctr_bitslice(len, out, inp, ctx, c); +} + diff --git a/src/msvc/Hacl_AES_256_CTR32_NI.c b/src/msvc/Hacl_AES_256_CTR32_NI.c new file mode 100644 index 00000000..81f94996 --- /dev/null +++ b/src/msvc/Hacl_AES_256_CTR32_NI.c @@ -0,0 +1,1433 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to 
permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_AES_256_CTR32_NI.h" + +void +Hacl_AES_256_CTR32_NI_aes256_init( + Lib_IntVector_Intrinsics_vec128 *ctx, + uint8_t *key, + uint8_t *nonce +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex; + Lib_IntVector_Intrinsics_vec128 *next1 = kex + klen; + next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(key + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; + Lib_IntVector_Intrinsics_vec128 *prev1 = next1; + Lib_IntVector_Intrinsics_vec128 *next01 = kex + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next11 = kex + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key1 = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + 
Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key4 = + Lib_IntVector_Intrinsics_vec128_xor(key3, + Lib_IntVector_Intrinsics_vec128_shift_left(key3, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key4); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v1, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key10 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key40 = + Lib_IntVector_Intrinsics_vec128_xor(key30, + Lib_IntVector_Intrinsics_vec128_shift_left(key30, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key40); + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128 *next02 = kex + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next12 = kex + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v2, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key11 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + 
Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key41 = + Lib_IntVector_Intrinsics_vec128_xor(key31, + Lib_IntVector_Intrinsics_vec128_shift_left(key31, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key41); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v3, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key12 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key42 = + Lib_IntVector_Intrinsics_vec128_xor(key32, + Lib_IntVector_Intrinsics_vec128_shift_left(key32, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key42); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next13 = kex + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U); + next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key13 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key43 = + Lib_IntVector_Intrinsics_vec128_xor(key33, + Lib_IntVector_Intrinsics_vec128_shift_left(key33, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key43); + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key14 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key44 = + Lib_IntVector_Intrinsics_vec128_xor(key34, + Lib_IntVector_Intrinsics_vec128_shift_left(key34, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key44); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); + next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key15 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + 
Lib_IntVector_Intrinsics_vec128 + key45 = + Lib_IntVector_Intrinsics_vec128_xor(key35, + Lib_IntVector_Intrinsics_vec128_shift_left(key35, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key45); + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key16 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key46 = + Lib_IntVector_Intrinsics_vec128_xor(key36, + Lib_IntVector_Intrinsics_vec128_shift_left(key36, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key46); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U], (uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key47 = + 
Lib_IntVector_Intrinsics_vec128_xor(key37, + Lib_IntVector_Intrinsics_vec128_shift_left(key37, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key47); + Lib_IntVector_Intrinsics_vec128 + v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key18 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key48 = + Lib_IntVector_Intrinsics_vec128_xor(key38, + Lib_IntVector_Intrinsics_vec128_shift_left(key38, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key48); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key19 = prev05[0U]; + Lib_IntVector_Intrinsics_vec128 + key29 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key49 = + Lib_IntVector_Intrinsics_vec128_xor(key39, + 
Lib_IntVector_Intrinsics_vec128_shift_left(key39, (uint32_t)32U)); + next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key49); + Lib_IntVector_Intrinsics_vec128 + v11 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v11, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key110 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key410 = + Lib_IntVector_Intrinsics_vec128_xor(key310, + Lib_IntVector_Intrinsics_vec128_shift_left(key310, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key410); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key111 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key411 = + Lib_IntVector_Intrinsics_vec128_xor(key311, + Lib_IntVector_Intrinsics_vec128_shift_left(key311, (uint32_t)32U)); + next07[0U] = 
Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key411); + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_256_CTR32_NI_aes256_set_nonce(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *nonce) +{ + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint8_t nb[16U] = { 0U }; + memcpy(nb, nonce, (uint32_t)12U * sizeof (uint8_t)); + n[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); +} + +void +Hacl_AES_256_CTR32_NI_aes256_key_block( + uint8_t *kb, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t counter +) +{ + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + uint32_t counter0 = htobe32(counter); + uint32_t counter1 = htobe32(counter + (uint32_t)1U); + uint32_t counter2 = htobe32(counter + (uint32_t)2U); + uint32_t counter3 = htobe32(counter + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * 
(uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128_store128_le(kb, st[0U]); +} + +void +Hacl_AES_256_CTR32_NI_aes256_ctr( + uint32_t len, + uint8_t *out, + uint8_t *inp, + Lib_IntVector_Intrinsics_vec128 *ctx, + uint32_t c +) +{ + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = 
Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % 
(uint32_t)64U; + uint8_t last[64U] = { 0U }; + if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = 
Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v0 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v0, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} +/* AES-256 in counter mode, one-shot: builds the round keys from the 32 bytes at k into a local ctx, then XORs len bytes of inp with the generated keystream and writes the result to out. n supplies the nonce (only its first 12 bytes are read) and c is the 32-bit counter used for the first 16-byte block; later blocks use c + 1, c + 2, ... computed in plain uint32_t arithmetic, so the counter wraps modulo 2^32. Nothing is returned. */ +inline void +Hacl_AES_256_CTR32_NI_aes256_ctr_encrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[16U] KRML_POST_ALIGN(16) = { 0U }; /* ctx[0] holds the nonce block, ctx[1]..ctx[15] (kex0[0]..kex0[14]) hold the round keys */ + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex0; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + klen; + next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k + (uint32_t)16U); /* kex0[0] and kex0[1] are the two 16-byte halves of k, loaded as-is */ + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; +
Lib_IntVector_Intrinsics_vec128 *prev1 = next1; + Lib_IntVector_Intrinsics_vec128 *next01 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next11 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key3); /* kex0[2]: shuffled keygen_assist(kex0[1], 0x01) XORed with the shift-and-xor expansion of kex0[0] */ + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key30); /* kex0[3]: same pattern from kex0[2] and kex0[1], with assist constant 0 and shuffle index 2; the steps below repeat this pair of derivations with assist constants 0x02, 0x04, 0x08, 0x10, 0x20, and a final single step with 0x40 produces kex0[14] */ + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128
*next02 = kex0 + klen * (uint32_t)4U; + Lib_IntVector_Intrinsics_vec128 *next12 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key31); + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key5 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128
*next13 = kex0 + klen * (uint32_t)7U; + Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U); + next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key33); + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key7 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 +
v9 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); + next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key35); + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key9 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex0 + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U],
(uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key37); + Lib_IntVector_Intrinsics_vec128 + v13 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v13, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key19 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key38); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex0 + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex0 + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v14 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = +
Lib_IntVector_Intrinsics_vec128_shuffle32(v14, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key29 = prev05[0U]; + Lib_IntVector_Intrinsics_vec128 + key111 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key39); + Lib_IntVector_Intrinsics_vec128 + v15 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v15, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key40 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key112 = + Lib_IntVector_Intrinsics_vec128_xor(key40, + Lib_IntVector_Intrinsics_vec128_shift_left(key40, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key112, + Lib_IntVector_Intrinsics_vec128_shift_left(key112, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key310); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex0 + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); +
Lib_IntVector_Intrinsics_vec128 key41 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key113 = + Lib_IntVector_Intrinsics_vec128_xor(key41, + Lib_IntVector_Intrinsics_vec128_shift_left(key41, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key212 = + Lib_IntVector_Intrinsics_vec128_xor(key113, + Lib_IntVector_Intrinsics_vec128_shift_left(key113, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key212, + Lib_IntVector_Intrinsics_vec128_shift_left(key212, (uint32_t)32U)); + next07[0U] = Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key311); + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); /* nonce block = first 12 bytes of n followed by 4 zero bytes; 32-bit lane 3 is overwritten with the counter for each block below */ + uint32_t blocks64 = len / (uint32_t)64U; /* number of full 64-byte chunks, each handled as four 16-byte blocks */ + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U],
k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); /* initial XOR of all four counter blocks with kex[0] */ + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); /* 13 rounds using kex[1]..kex[13] */ + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); /* final round with kex[14]; st[] now holds 64 bytes of keystream to XOR with the input */ + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; /* bytes left over after the full 64-byte chunks */ + uint8_t last[64U] = { 0U };
+ if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); /* stage the partial chunk in the zero-initialised 64-byte buffer so that whole 16-byte vectors can be loaded and stored */ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U],
kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); /* only the first rem bytes of the processed buffer are written to out */ + } +} + +inline void +Hacl_AES_256_CTR32_NI_aes256_ctr_decrypt( + uint32_t len, + uint8_t *out, + uint8_t *inp, + uint8_t *k, + uint8_t *n, + uint32_t c +) +{ + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 ctx[16U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex0 = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n10 = ctx; + uint32_t klen = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *next0 = kex0; + Lib_IntVector_Intrinsics_vec128 *next1 = kex0 + klen; + next0[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k); + next1[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(k + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 *prev0 = next0; + Lib_IntVector_Intrinsics_vec128 *prev1 = next1; +
Lib_IntVector_Intrinsics_vec128 *next01 = kex0 + klen * (uint32_t)2U; + Lib_IntVector_Intrinsics_vec128 *next11 = kex0 + klen * (uint32_t)3U; + Lib_IntVector_Intrinsics_vec128 + v0 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev1[0U], (uint8_t)0x01U); + next01[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v0, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key = prev0[0U]; + Lib_IntVector_Intrinsics_vec128 + key1 = + Lib_IntVector_Intrinsics_vec128_xor(key, + Lib_IntVector_Intrinsics_vec128_shift_left(key, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key2 = + Lib_IntVector_Intrinsics_vec128_xor(key1, + Lib_IntVector_Intrinsics_vec128_shift_left(key1, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key3 = + Lib_IntVector_Intrinsics_vec128_xor(key2, + Lib_IntVector_Intrinsics_vec128_shift_left(key2, (uint32_t)32U)); + next01[0U] = Lib_IntVector_Intrinsics_vec128_xor(next01[0U], key3); + Lib_IntVector_Intrinsics_vec128 + v4 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next01[0U], (uint8_t)0U); + next11[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v4, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key0 = prev1[0U]; + Lib_IntVector_Intrinsics_vec128 + key10 = + Lib_IntVector_Intrinsics_vec128_xor(key0, + Lib_IntVector_Intrinsics_vec128_shift_left(key0, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key20 = + Lib_IntVector_Intrinsics_vec128_xor(key10, + Lib_IntVector_Intrinsics_vec128_shift_left(key10, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key30 = + Lib_IntVector_Intrinsics_vec128_xor(key20, + Lib_IntVector_Intrinsics_vec128_shift_left(key20, (uint32_t)32U)); + next11[0U] = Lib_IntVector_Intrinsics_vec128_xor(next11[0U], key30); + Lib_IntVector_Intrinsics_vec128 *prev01 = next01; + Lib_IntVector_Intrinsics_vec128 *prev11 = next11; + Lib_IntVector_Intrinsics_vec128 *next02 = kex0 + klen * (uint32_t)4U; + 
Lib_IntVector_Intrinsics_vec128 *next12 = kex0 + klen * (uint32_t)5U; + Lib_IntVector_Intrinsics_vec128 + v5 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev11[0U], (uint8_t)0x02U); + next02[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v5, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key4 = prev01[0U]; + Lib_IntVector_Intrinsics_vec128 + key11 = + Lib_IntVector_Intrinsics_vec128_xor(key4, + Lib_IntVector_Intrinsics_vec128_shift_left(key4, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key21 = + Lib_IntVector_Intrinsics_vec128_xor(key11, + Lib_IntVector_Intrinsics_vec128_shift_left(key11, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key31 = + Lib_IntVector_Intrinsics_vec128_xor(key21, + Lib_IntVector_Intrinsics_vec128_shift_left(key21, (uint32_t)32U)); + next02[0U] = Lib_IntVector_Intrinsics_vec128_xor(next02[0U], key31); + Lib_IntVector_Intrinsics_vec128 + v6 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next02[0U], (uint8_t)0U); + next12[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v6, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key5 = prev11[0U]; + Lib_IntVector_Intrinsics_vec128 + key12 = + Lib_IntVector_Intrinsics_vec128_xor(key5, + Lib_IntVector_Intrinsics_vec128_shift_left(key5, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key22 = + Lib_IntVector_Intrinsics_vec128_xor(key12, + Lib_IntVector_Intrinsics_vec128_shift_left(key12, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key32 = + Lib_IntVector_Intrinsics_vec128_xor(key22, + Lib_IntVector_Intrinsics_vec128_shift_left(key22, (uint32_t)32U)); + next12[0U] = Lib_IntVector_Intrinsics_vec128_xor(next12[0U], key32); + Lib_IntVector_Intrinsics_vec128 *prev02 = next02; + Lib_IntVector_Intrinsics_vec128 *prev12 = next12; + Lib_IntVector_Intrinsics_vec128 *next03 = kex0 + klen * (uint32_t)6U; + Lib_IntVector_Intrinsics_vec128 *next13 = kex0 + klen * (uint32_t)7U; + 
Lib_IntVector_Intrinsics_vec128 + v7 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev12[0U], (uint8_t)0x04U); + next03[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v7, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key6 = prev02[0U]; + Lib_IntVector_Intrinsics_vec128 + key13 = + Lib_IntVector_Intrinsics_vec128_xor(key6, + Lib_IntVector_Intrinsics_vec128_shift_left(key6, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key23 = + Lib_IntVector_Intrinsics_vec128_xor(key13, + Lib_IntVector_Intrinsics_vec128_shift_left(key13, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key33 = + Lib_IntVector_Intrinsics_vec128_xor(key23, + Lib_IntVector_Intrinsics_vec128_shift_left(key23, (uint32_t)32U)); + next03[0U] = Lib_IntVector_Intrinsics_vec128_xor(next03[0U], key33); + Lib_IntVector_Intrinsics_vec128 + v8 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next03[0U], (uint8_t)0U); + next13[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v8, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key7 = prev12[0U]; + Lib_IntVector_Intrinsics_vec128 + key14 = + Lib_IntVector_Intrinsics_vec128_xor(key7, + Lib_IntVector_Intrinsics_vec128_shift_left(key7, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key24 = + Lib_IntVector_Intrinsics_vec128_xor(key14, + Lib_IntVector_Intrinsics_vec128_shift_left(key14, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key34 = + Lib_IntVector_Intrinsics_vec128_xor(key24, + Lib_IntVector_Intrinsics_vec128_shift_left(key24, (uint32_t)32U)); + next13[0U] = Lib_IntVector_Intrinsics_vec128_xor(next13[0U], key34); + Lib_IntVector_Intrinsics_vec128 *prev03 = next03; + Lib_IntVector_Intrinsics_vec128 *prev13 = next13; + Lib_IntVector_Intrinsics_vec128 *next04 = kex0 + klen * (uint32_t)8U; + Lib_IntVector_Intrinsics_vec128 *next14 = kex0 + klen * (uint32_t)9U; + Lib_IntVector_Intrinsics_vec128 + v9 = 
Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev13[0U], (uint8_t)0x08U); + next04[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v9, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key8 = prev03[0U]; + Lib_IntVector_Intrinsics_vec128 + key15 = + Lib_IntVector_Intrinsics_vec128_xor(key8, + Lib_IntVector_Intrinsics_vec128_shift_left(key8, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key25 = + Lib_IntVector_Intrinsics_vec128_xor(key15, + Lib_IntVector_Intrinsics_vec128_shift_left(key15, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key35 = + Lib_IntVector_Intrinsics_vec128_xor(key25, + Lib_IntVector_Intrinsics_vec128_shift_left(key25, (uint32_t)32U)); + next04[0U] = Lib_IntVector_Intrinsics_vec128_xor(next04[0U], key35); + Lib_IntVector_Intrinsics_vec128 + v10 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next04[0U], (uint8_t)0U); + next14[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v10, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key9 = prev13[0U]; + Lib_IntVector_Intrinsics_vec128 + key16 = + Lib_IntVector_Intrinsics_vec128_xor(key9, + Lib_IntVector_Intrinsics_vec128_shift_left(key9, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key26 = + Lib_IntVector_Intrinsics_vec128_xor(key16, + Lib_IntVector_Intrinsics_vec128_shift_left(key16, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key36 = + Lib_IntVector_Intrinsics_vec128_xor(key26, + Lib_IntVector_Intrinsics_vec128_shift_left(key26, (uint32_t)32U)); + next14[0U] = Lib_IntVector_Intrinsics_vec128_xor(next14[0U], key36); + Lib_IntVector_Intrinsics_vec128 *prev04 = next04; + Lib_IntVector_Intrinsics_vec128 *prev14 = next14; + Lib_IntVector_Intrinsics_vec128 *next05 = kex0 + klen * (uint32_t)10U; + Lib_IntVector_Intrinsics_vec128 *next15 = kex0 + klen * (uint32_t)11U; + Lib_IntVector_Intrinsics_vec128 + v12 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev14[0U], 
(uint8_t)0x10U); + next05[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v12, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key17 = prev04[0U]; + Lib_IntVector_Intrinsics_vec128 + key18 = + Lib_IntVector_Intrinsics_vec128_xor(key17, + Lib_IntVector_Intrinsics_vec128_shift_left(key17, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key27 = + Lib_IntVector_Intrinsics_vec128_xor(key18, + Lib_IntVector_Intrinsics_vec128_shift_left(key18, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key37 = + Lib_IntVector_Intrinsics_vec128_xor(key27, + Lib_IntVector_Intrinsics_vec128_shift_left(key27, (uint32_t)32U)); + next05[0U] = Lib_IntVector_Intrinsics_vec128_xor(next05[0U], key37); + Lib_IntVector_Intrinsics_vec128 + v13 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next05[0U], (uint8_t)0U); + next15[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v13, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key19 = prev14[0U]; + Lib_IntVector_Intrinsics_vec128 + key110 = + Lib_IntVector_Intrinsics_vec128_xor(key19, + Lib_IntVector_Intrinsics_vec128_shift_left(key19, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key28 = + Lib_IntVector_Intrinsics_vec128_xor(key110, + Lib_IntVector_Intrinsics_vec128_shift_left(key110, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key38 = + Lib_IntVector_Intrinsics_vec128_xor(key28, + Lib_IntVector_Intrinsics_vec128_shift_left(key28, (uint32_t)32U)); + next15[0U] = Lib_IntVector_Intrinsics_vec128_xor(next15[0U], key38); + Lib_IntVector_Intrinsics_vec128 *prev05 = next05; + Lib_IntVector_Intrinsics_vec128 *prev15 = next15; + Lib_IntVector_Intrinsics_vec128 *next06 = kex0 + klen * (uint32_t)12U; + Lib_IntVector_Intrinsics_vec128 *next16 = kex0 + klen * (uint32_t)13U; + Lib_IntVector_Intrinsics_vec128 + v14 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev15[0U], (uint8_t)0x20U); + next06[0U] = + 
Lib_IntVector_Intrinsics_vec128_shuffle32(v14, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 key29 = prev05[0U]; + Lib_IntVector_Intrinsics_vec128 + key111 = + Lib_IntVector_Intrinsics_vec128_xor(key29, + Lib_IntVector_Intrinsics_vec128_shift_left(key29, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key210 = + Lib_IntVector_Intrinsics_vec128_xor(key111, + Lib_IntVector_Intrinsics_vec128_shift_left(key111, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key39 = + Lib_IntVector_Intrinsics_vec128_xor(key210, + Lib_IntVector_Intrinsics_vec128_shift_left(key210, (uint32_t)32U)); + next06[0U] = Lib_IntVector_Intrinsics_vec128_xor(next06[0U], key39); + Lib_IntVector_Intrinsics_vec128 + v15 = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(next06[0U], (uint8_t)0U); + next16[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v15, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U, + (uint32_t)2U); + Lib_IntVector_Intrinsics_vec128 key40 = prev15[0U]; + Lib_IntVector_Intrinsics_vec128 + key112 = + Lib_IntVector_Intrinsics_vec128_xor(key40, + Lib_IntVector_Intrinsics_vec128_shift_left(key40, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key211 = + Lib_IntVector_Intrinsics_vec128_xor(key112, + Lib_IntVector_Intrinsics_vec128_shift_left(key112, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key310 = + Lib_IntVector_Intrinsics_vec128_xor(key211, + Lib_IntVector_Intrinsics_vec128_shift_left(key211, (uint32_t)32U)); + next16[0U] = Lib_IntVector_Intrinsics_vec128_xor(next16[0U], key310); + Lib_IntVector_Intrinsics_vec128 *prev06 = next06; + Lib_IntVector_Intrinsics_vec128 *prev16 = next16; + Lib_IntVector_Intrinsics_vec128 *next07 = kex0 + klen * (uint32_t)14U; + Lib_IntVector_Intrinsics_vec128 + v = Lib_IntVector_Intrinsics_ni_aes_keygen_assist(prev16[0U], (uint8_t)0x40U); + next07[0U] = + Lib_IntVector_Intrinsics_vec128_shuffle32(v, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U, + (uint32_t)3U); + 
Lib_IntVector_Intrinsics_vec128 key41 = prev06[0U]; + Lib_IntVector_Intrinsics_vec128 + key113 = + Lib_IntVector_Intrinsics_vec128_xor(key41, + Lib_IntVector_Intrinsics_vec128_shift_left(key41, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key212 = + Lib_IntVector_Intrinsics_vec128_xor(key113, + Lib_IntVector_Intrinsics_vec128_shift_left(key113, (uint32_t)32U)); + Lib_IntVector_Intrinsics_vec128 + key311 = + Lib_IntVector_Intrinsics_vec128_xor(key212, + Lib_IntVector_Intrinsics_vec128_shift_left(key212, (uint32_t)32U)); + next07[0U] = Lib_IntVector_Intrinsics_vec128_xor(next07[0U], key311); + uint8_t nb[16U] = { 0U }; + memcpy(nb, n, (uint32_t)12U * sizeof (uint8_t)); + n10[0U] = Lib_IntVector_Intrinsics_vec128_load128_le(nb); + uint32_t blocks64 = len / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < blocks64; i++) + { + uint32_t ctr = c + i * (uint32_t)4U; + uint8_t *ib = inp + i * (uint32_t)64U; + uint8_t *ob = out + i * (uint32_t)64U; + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], 
k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i0, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i0 * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(ib); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(ib + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(ob, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(ob + (uint32_t)48U, v31); + } + uint32_t rem = len % (uint32_t)64U; + uint8_t last[64U] = { 0U }; 
+ if (rem > (uint32_t)0U) + { + uint32_t ctr = c + blocks64 * (uint32_t)4U; + uint8_t *ib = inp + blocks64 * (uint32_t)64U; + uint8_t *ob = out + blocks64 * (uint32_t)64U; + memcpy(last, ib, rem * sizeof (uint8_t)); + KRML_PRE_ALIGN(16) Lib_IntVector_Intrinsics_vec128 st[4U] KRML_POST_ALIGN(16) = { 0U }; + Lib_IntVector_Intrinsics_vec128 *kex = ctx + (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *n1 = ctx; + uint32_t counter0 = htobe32(ctr); + uint32_t counter1 = htobe32(ctr + (uint32_t)1U); + uint32_t counter2 = htobe32(ctr + (uint32_t)2U); + uint32_t counter3 = htobe32(ctr + (uint32_t)3U); + Lib_IntVector_Intrinsics_vec128 nonce0 = n1[0U]; + st[0U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter0, (uint32_t)3U); + st[1U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter1, (uint32_t)3U); + st[2U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter2, (uint32_t)3U); + st[3U] = Lib_IntVector_Intrinsics_vec128_insert32(nonce0, counter3, (uint32_t)3U); + uint32_t klen0 = (uint32_t)1U; + Lib_IntVector_Intrinsics_vec128 *k0 = kex; + Lib_IntVector_Intrinsics_vec128 *kr = kex + klen0; + Lib_IntVector_Intrinsics_vec128 *kn = kex + (uint32_t)14U * klen0; + st[0U] = Lib_IntVector_Intrinsics_vec128_xor(st[0U], k0[0U]); + st[1U] = Lib_IntVector_Intrinsics_vec128_xor(st[1U], k0[0U]); + st[2U] = Lib_IntVector_Intrinsics_vec128_xor(st[2U], k0[0U]); + st[3U] = Lib_IntVector_Intrinsics_vec128_xor(st[3U], k0[0U]); + KRML_MAYBE_FOR13(i, + (uint32_t)0U, + (uint32_t)13U, + (uint32_t)1U, + Lib_IntVector_Intrinsics_vec128 *sub_key = kr + i * (uint32_t)1U; + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[0U], sub_key[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[1U], sub_key[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[2U], sub_key[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc(st[3U], sub_key[0U]);); + st[0U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[0U], kn[0U]); + st[1U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[1U], 
kn[0U]); + st[2U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[2U], kn[0U]); + st[3U] = Lib_IntVector_Intrinsics_ni_aes_enc_last(st[3U], kn[0U]); + Lib_IntVector_Intrinsics_vec128 v00 = Lib_IntVector_Intrinsics_vec128_load128_le(last); + Lib_IntVector_Intrinsics_vec128 + v1 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)16U); + Lib_IntVector_Intrinsics_vec128 + v2 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)32U); + Lib_IntVector_Intrinsics_vec128 + v3 = Lib_IntVector_Intrinsics_vec128_load128_le(last + (uint32_t)48U); + Lib_IntVector_Intrinsics_vec128 v01 = Lib_IntVector_Intrinsics_vec128_xor(v00, st[0U]); + Lib_IntVector_Intrinsics_vec128 v11 = Lib_IntVector_Intrinsics_vec128_xor(v1, st[1U]); + Lib_IntVector_Intrinsics_vec128 v21 = Lib_IntVector_Intrinsics_vec128_xor(v2, st[2U]); + Lib_IntVector_Intrinsics_vec128 v31 = Lib_IntVector_Intrinsics_vec128_xor(v3, st[3U]); + Lib_IntVector_Intrinsics_vec128_store128_le(last, v01); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)16U, v11); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)32U, v21); + Lib_IntVector_Intrinsics_vec128_store128_le(last + (uint32_t)48U, v31); + memcpy(ob, last, rem * sizeof (uint8_t)); + } +} + diff --git a/src/msvc/Hacl_AES_128_GCM_M32.c b/src/msvc/Hacl_AES_256_GCM_CT64.c similarity index 64% rename from src/msvc/Hacl_AES_128_GCM_M32.c rename to src/msvc/Hacl_AES_256_GCM_CT64.c index bd172a0e..436f2318 100644 --- a/src/msvc/Hacl_AES_128_GCM_M32.c +++ b/src/msvc/Hacl_AES_256_GCM_CT64.c @@ -23,25 +23,23 @@ */ -#include "Hacl_AES_128_GCM_M32.h" +#include "Hacl_AES_256_GCM_CT64.h" -#include "internal/Hacl_AES_128_BitSlice.h" +uint32_t Hacl_AES_256_GCM_CT64_aes_gcm_ctx_len = (uint32_t)148U; -uint32_t Hacl_AES_128_GCM_M32_aes_gcm_ctx_len = (uint32_t)396U; - -void Hacl_AES_128_GCM_M32_aes128_gcm_init(uint64_t *ctx, uint8_t *key) +void Hacl_AES_256_GCM_CT64_aes256_gcm_init(uint64_t *ctx, uint8_t *key) { uint8_t gcm_key[16U] = { 0U }; 
uint8_t nonce0[12U] = { 0U }; uint64_t *aes_ctx = ctx; uint64_t *gcm_ctx = ctx + (uint32_t)128U; - Hacl_AES_128_BitSlice_aes128_init(aes_ctx, key, nonce0); - Hacl_AES_128_BitSlice_aes128_key_block(gcm_key, aes_ctx, (uint32_t)0U); - Hacl_Gf128_PreComp_gcm_init(gcm_ctx, gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_init(aes_ctx, key, nonce0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(gcm_key, aes_ctx, (uint32_t)0U); + Hacl_Gf128_CT64_gcm_init(gcm_ctx, gcm_key); } void -Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_encrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -56,7 +54,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint8_t *cip = out; uint64_t *aes_ctx = ctx; uint64_t *gcm_ctx = ctx + (uint32_t)128U; - uint64_t *tag_mix = ctx + (uint32_t)394U; + uint64_t *tag_mix = ctx + (uint32_t)146U; uint32_t ctr; uint8_t tag_mix10[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; @@ -66,12 +64,12 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( if (iv_len == (uint32_t)12U) { uint64_t *aes_ctx1 = ctx; - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx1, iv); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); uint64_t u = load64_le(tag_mix10); - ctx[394U] = u; + ctx[146U] = u; uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); - ctx[395U] = u0; + ctx[147U] = u0; ctr = (uint32_t)2U; } else @@ -80,31 +78,33 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint64_t *gcm_ctx1 = ctx + (uint32_t)128U; store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); store64_be(gcm_key, gcm_ctx1[9U]); - Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, (uint32_t)0U, (uint32_t)16U, (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); - 
Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx1, tag_iv); uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u0; - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix1, aes_ctx1, ctr0); uint64_t u = load64_le(tag_mix1); - ctx[394U] = u; + ctx[146U] = u; uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); - ctx[395U] = u1; + ctx[147U] = u1; ctr = ctr0 + (uint32_t)1U; } - Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, cip, text, aes_ctx, ctr); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, cip); + Hacl_AES_256_CTR32_BitSlice_aes256_ctr(len, cip, text, aes_ctx, ctr); + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, cip); store64_be(tmp, (uint64_t)(aad_len * (uint32_t)8U)); store64_be(tmp + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); - Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); - Hacl_Gf128_PreComp_gcm_emit(tmp, gcm_ctx); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp); + Hacl_Gf128_CT64_gcm_emit(tmp, gcm_ctx); uint64_t u0 = load64_le(tmp); uint64_t tmp0 = u0; uint64_t u = load64_le(tmp + (uint32_t)8U); @@ -113,12 +113,10 @@ Hacl_AES_128_GCM_M32_aes128_gcm_encrypt( uint64_t tmp11 = tmp1 ^ tag_mix[1U]; store64_le(out + len, tmp01); store64_le(out + len + (uint32_t)8U, tmp11); - gcm_ctx[0U] = (uint64_t)0U; - gcm_ctx[1U] = (uint64_t)0U; } bool -Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( +Hacl_AES_256_GCM_CT64_aes256_gcm_decrypt( uint64_t *ctx, uint32_t len, uint8_t *out, @@ -134,56 +132,58 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( uint8_t 
*result = scratch + (uint32_t)17U; uint8_t *ciphertext = cipher; uint8_t *tag = cipher + len; - uint64_t *aes_ctx = ctx; - uint64_t *gcm_ctx = ctx + (uint32_t)128U; - uint64_t *tag_mix = ctx + (uint32_t)394U; uint32_t ctr; - uint8_t tag_mix10[16U] = { 0U }; + uint8_t tag_mix0[16U] = { 0U }; uint8_t gcm_key[16U] = { 0U }; uint8_t tag_iv[16U] = { 0U }; uint8_t size_iv[16U] = { 0U }; uint8_t tag_mix1[16U] = { 0U }; if (iv_len == (uint32_t)12U) { - uint64_t *aes_ctx1 = ctx; - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, iv); - Hacl_AES_128_BitSlice_aes128_key_block(tag_mix10, aes_ctx1, (uint32_t)1U); - uint64_t u = load64_le(tag_mix10); - ctx[394U] = u; - uint64_t u0 = load64_le(tag_mix10 + (uint32_t)8U); - ctx[395U] = u0; + uint64_t *aes_ctx = ctx; + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx, iv); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U); + uint64_t u = load64_le(tag_mix0); + ctx[146U] = u; + uint64_t u0 = load64_le(tag_mix0 + (uint32_t)8U); + ctx[147U] = u0; ctr = (uint32_t)2U; } else { - uint64_t *aes_ctx1 = ctx; - uint64_t *gcm_ctx1 = ctx + (uint32_t)128U; - store64_be(gcm_key + (uint32_t)8U, gcm_ctx1[8U]); - store64_be(gcm_key, gcm_ctx1[9U]); - Hacl_Gf128_PreComp_ghash(tag_iv, iv_len, iv, gcm_key); + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)128U; + store64_be(gcm_key + (uint32_t)8U, gcm_ctx[8U]); + store64_be(gcm_key, gcm_ctx[9U]); + Hacl_Gf128_CT64_ghash(tag_iv, iv_len, iv, gcm_key); store64_be(size_iv + (uint32_t)8U, (uint64_t)(iv_len * (uint32_t)8U)); KRML_MAYBE_FOR16(i, (uint32_t)0U, (uint32_t)16U, (uint32_t)1U, size_iv[i] = tag_iv[i] ^ size_iv[i];); - Hacl_Gf128_PreComp_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); - Hacl_AES_128_BitSlice_aes128_set_nonce(aes_ctx1, tag_iv); + Hacl_Gf128_CT64_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key); + Hacl_AES_256_CTR32_BitSlice_aes256_set_nonce(aes_ctx, tag_iv); uint32_t u0 = load32_be(tag_iv + (uint32_t)12U); uint32_t ctr0 = u0; - 
Hacl_AES_128_BitSlice_aes128_key_block(tag_mix1, aes_ctx1, ctr0); + Hacl_AES_256_CTR32_BitSlice_aes256_key_block(tag_mix1, aes_ctx, ctr0); uint64_t u = load64_le(tag_mix1); - ctx[394U] = u; + ctx[146U] = u; uint64_t u1 = load64_le(tag_mix1 + (uint32_t)8U); - ctx[395U] = u1; + ctx[147U] = u1; ctr = ctr0 + (uint32_t)1U; } - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); - Hacl_Gf128_PreComp_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); + uint64_t *aes_ctx = ctx; + uint64_t *gcm_ctx = ctx + (uint32_t)128U; + uint64_t *tag_mix = ctx + (uint32_t)146U; + gcm_ctx[0U] = (uint64_t)0U; + gcm_ctx[1U] = (uint64_t)0U; + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, aad_len, aad); + Hacl_Gf128_CT64_gcm_update_blocks_padded(gcm_ctx, len, ciphertext); store64_be(text, (uint64_t)(aad_len * (uint32_t)8U)); store64_be(text + (uint32_t)8U, (uint64_t)(len * (uint32_t)8U)); - Hacl_Gf128_PreComp_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); - Hacl_Gf128_PreComp_gcm_emit(text, gcm_ctx); + Hacl_Gf128_CT64_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text); + Hacl_Gf128_CT64_gcm_emit(text, gcm_ctx); uint64_t u0 = load64_le(text); uint64_t text0 = u0; uint64_t u = load64_le(text + (uint32_t)8U); @@ -200,7 +200,7 @@ Hacl_AES_128_GCM_M32_aes128_gcm_decrypt( uint8_t res8 = result[0U]; if (res8 == (uint8_t)0U) { - Hacl_Impl_AES_Generic_aes128_ctr_bitslice(len, out, ciphertext, aes_ctx, ctr); + Hacl_AES_256_CTR32_BitSlice_aes256_ctr(len, out, ciphertext, aes_ctx, ctr); return true; } return false; diff --git a/src/msvc/Hacl_AES_256_GCM_NI.c b/src/msvc/Hacl_AES_256_GCM_NI.c new file mode 100644 index 00000000..7c415057 --- /dev/null +++ b/src/msvc/Hacl_AES_256_GCM_NI.c @@ -0,0 +1,182 @@ +/* MIT License + * + * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation + * Copyright (c) 2022-2023 HACL* Contributors + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the 
"Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#include "Hacl_AES_256_GCM_NI.h"
+
+/*
+ * Initialise an AES-256-GCM context from a raw key.
+ *
+ * ctx: combined state; the AES part starts at ctx[0] and the GHASH part at
+ *      ctx[16].
+ * key: handed unchanged to Hacl_AES_256_CTR32_NI_aes256_init.
+ *      NOTE(review): presumably 32 bytes for AES-256 -- confirm against that
+ *      function's contract.
+ *
+ * The GHASH key is the AES key block for counter 0 produced while the context
+ * still carries the all-zero 12-byte nonce set up here.
+ */
+void Hacl_AES_256_GCM_NI_aes256_gcm_init(Lib_IntVector_Intrinsics_vec128 *ctx, uint8_t *key)
+{
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t nonce0[12U] = { 0U };
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  Hacl_AES_256_CTR32_NI_aes256_init(aes_ctx, key, nonce0);
+  Hacl_AES_256_CTR32_NI_aes256_key_block(gcm_key, aes_ctx, (uint32_t)0U);
+  Hacl_Gf128_NI_gcm_init(gcm_ctx, gcm_key);
+}
+
+/*
+ * Encrypt `len` bytes of `text` and append a 16-byte authentication tag.
+ *
+ * ctx:      context prepared by Hacl_AES_256_GCM_NI_aes256_gcm_init; the nonce
+ *           in the AES part, ctx[16] (GHASH accumulator) and ctx[21] (tag
+ *           mask) are overwritten on every call.
+ * len:      number of plaintext bytes.
+ * out:      receives `len` ciphertext bytes followed by the 16-byte tag, so
+ *           it must provide room for len + 16 bytes.
+ * text:     plaintext input.
+ * aad_len,
+ * aad:      additional authenticated data, hashed before the ciphertext.
+ * iv_len,
+ * iv:       a 12-byte IV is installed directly as the nonce (tag mask from
+ *           counter 1, data from counter 2).  Any other length is first run
+ *           through GHASH to derive both the nonce and the starting counter.
+ */
+void
+Hacl_AES_256_GCM_NI_aes256_gcm_encrypt(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *text,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  uint32_t ctr;
+  uint8_t tag_mix0[16U] = { 0U };
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t tag_iv[16U] = { 0U };
+  uint8_t size_iv[16U] = { 0U };
+  uint8_t tag_mix1[16U] = { 0U };
+  if (iv_len == (uint32_t)12U)
+  {
+    Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+    Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, iv);
+    Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U);
+    ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0);
+    ctr = (uint32_t)2U;
+  }
+  else
+  {
+    Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+    Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+    /* Slot 4 of the GHASH state is read back as the 16-byte hash key. */
+    Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]);
+    Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key);
+    /* Widen before multiplying: a 32-bit product wraps once iv_len >= 2^29. */
+    store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U);
+    KRML_MAYBE_FOR16(i,
+      (uint32_t)0U,
+      (uint32_t)16U,
+      (uint32_t)1U,
+      size_iv[i] = tag_iv[i] ^ size_iv[i];);
+    Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+    Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, tag_iv);
+    /* The last four bytes of the hashed IV give the initial counter. */
+    uint32_t u = load32_be(tag_iv + (uint32_t)12U);
+    uint32_t ctr0 = u;
+    Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix1, aes_ctx, ctr0);
+    ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1);
+    ctr = ctr0 + (uint32_t)1U;
+  }
+  uint8_t *cip = out;
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  Hacl_AES_256_CTR32_NI_aes256_ctr(len, cip, text, aes_ctx, ctr);
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U];
+  /* Start the GHASH accumulator from zero so the context can be reused. */
+  gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero;
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, cip);
+  uint8_t tmp[16U] = { 0U };
+  /*
+   * Final GHASH block: the 64-bit big-endian bit lengths of the AAD and of the
+   * ciphertext.  Each length is widened to 64 bits before the multiply; doing
+   * the multiply in uint32_t would wrap for inputs of 2^29 bytes or more.
+   */
+  store64_be(tmp, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(tmp + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, tmp);
+  Hacl_Gf128_NI_gcm_emit(tmp, gcm_ctx);
+  /* Mask the GHASH output with the saved key block and emit it as the tag. */
+  Lib_IntVector_Intrinsics_vec128 tmp_vec = Lib_IntVector_Intrinsics_vec128_load128_le(tmp);
+  Lib_IntVector_Intrinsics_vec128
+  tmp_vec1 = Lib_IntVector_Intrinsics_vec128_xor(tmp_vec, tag_mix);
+  Lib_IntVector_Intrinsics_vec128_store128_le(out + len, tmp_vec1);
+}
+
+/*
+ * Verify the tag of an AES-256-GCM message and, only if it matches, decrypt.
+ *
+ * ctx:      context prepared by Hacl_AES_256_GCM_NI_aes256_gcm_init; the nonce
+ *           in the AES part, ctx[16] and ctx[21] are overwritten on every
+ *           call.
+ * len:      number of ciphertext bytes (the tag is not counted).
+ * out:      receives `len` plaintext bytes when the tag matches; it is not
+ *           written by this function otherwise.
+ * cipher:   `len` ciphertext bytes immediately followed by the 16-byte tag.
+ * aad_len,
+ * aad:      additional authenticated data, hashed before the ciphertext.
+ * iv_len,
+ * iv:       handled exactly as in the encrypt function.
+ *
+ * Returns true when the recomputed tag equals the supplied one, false
+ * otherwise.
+ */
+bool
+Hacl_AES_256_GCM_NI_aes256_gcm_decrypt(
+  Lib_IntVector_Intrinsics_vec128 *ctx,
+  uint32_t len,
+  uint8_t *out,
+  uint8_t *cipher,
+  uint32_t aad_len,
+  uint8_t *aad,
+  uint32_t iv_len,
+  uint8_t *iv
+)
+{
+  /* scratch[0..15] holds the recomputed tag, scratch[17] the compare result. */
+  uint8_t scratch[18U] = { 0U };
+  uint8_t *text = scratch;
+  uint8_t *result = scratch + (uint32_t)17U;
+  uint8_t *ciphertext = cipher;
+  uint8_t *tag = cipher + len;
+  uint32_t ctr;
+  uint8_t tag_mix0[16U] = { 0U };
+  uint8_t gcm_key[16U] = { 0U };
+  uint8_t tag_iv[16U] = { 0U };
+  uint8_t size_iv[16U] = { 0U };
+  uint8_t tag_mix1[16U] = { 0U };
+  if (iv_len == (uint32_t)12U)
+  {
+    Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+    Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, iv);
+    Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix0, aes_ctx, (uint32_t)1U);
+    ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix0);
+    ctr = (uint32_t)2U;
+  }
+  else
+  {
+    Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+    Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+    /* Slot 4 of the GHASH state is read back as the 16-byte hash key. */
+    Lib_IntVector_Intrinsics_vec128_store_be(gcm_key, gcm_ctx[4U]);
+    Hacl_Gf128_NI_ghash(tag_iv, iv_len, iv, gcm_key);
+    /* Widen before multiplying: a 32-bit product wraps once iv_len >= 2^29. */
+    store64_be(size_iv + (uint32_t)8U, (uint64_t)iv_len * (uint64_t)8U);
+    KRML_MAYBE_FOR16(i,
+      (uint32_t)0U,
+      (uint32_t)16U,
+      (uint32_t)1U,
+      size_iv[i] = tag_iv[i] ^ size_iv[i];);
+    Hacl_Gf128_NI_ghash(tag_iv, (uint32_t)16U, size_iv, gcm_key);
+    Hacl_AES_256_CTR32_NI_aes256_set_nonce(aes_ctx, tag_iv);
+    /* The last four bytes of the hashed IV give the initial counter. */
+    uint32_t u = load32_be(tag_iv + (uint32_t)12U);
+    uint32_t ctr0 = u;
+    Hacl_AES_256_CTR32_NI_aes256_key_block(tag_mix1, aes_ctx, ctr0);
+    ctx[21U] = Lib_IntVector_Intrinsics_vec128_load128_le(tag_mix1);
+    ctr = ctr0 + (uint32_t)1U;
+  }
+  Lib_IntVector_Intrinsics_vec128 *aes_ctx = ctx;
+  Lib_IntVector_Intrinsics_vec128 *gcm_ctx = ctx + (uint32_t)16U;
+  Lib_IntVector_Intrinsics_vec128 tag_mix = ctx[21U];
+  /* Start the GHASH accumulator from zero so the context can be reused. */
+  gcm_ctx[0U] = Lib_IntVector_Intrinsics_vec128_zero;
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, aad_len, aad);
+  Hacl_Gf128_NI_gcm_update_padded(gcm_ctx, len, ciphertext);
+  /*
+   * Final GHASH block: the 64-bit big-endian bit lengths of the AAD and of the
+   * ciphertext.  Each length is widened to 64 bits before the multiply; doing
+   * the multiply in uint32_t would wrap for inputs of 2^29 bytes or more.
+   */
+  store64_be(text, (uint64_t)aad_len * (uint64_t)8U);
+  store64_be(text + (uint32_t)8U, (uint64_t)len * (uint64_t)8U);
+  Hacl_Gf128_NI_gcm_update_blocks(gcm_ctx, (uint32_t)16U, text);
+  Hacl_Gf128_NI_gcm_emit(text, gcm_ctx);
+  Lib_IntVector_Intrinsics_vec128 text_vec = Lib_IntVector_Intrinsics_vec128_load128_le(text);
+  Lib_IntVector_Intrinsics_vec128
+  text_vec1 = Lib_IntVector_Intrinsics_vec128_xor(text_vec, tag_mix);
+  Lib_IntVector_Intrinsics_vec128_store128_le(text, text_vec1);
+  /*
+   * OR together the XOR of all 16 tag byte pairs; there is no early exit on
+   * the first differing byte.
+   */
+  KRML_MAYBE_FOR16(i,
+    (uint32_t)0U,
+    (uint32_t)16U,
+    (uint32_t)1U,
+    result[0U] = result[0U] | (text[i] ^ tag[i]););
+  uint8_t res8 = result[0U];
+  if (res8 == (uint8_t)0U)
+  {
+    /* Tag verified: only now is the plaintext produced. */
+    Hacl_AES_256_CTR32_NI_aes256_ctr(len, out, ciphertext, aes_ctx, ctr);
+    return true;
+  }
+  return false;
+}
+
diff --git a/src/msvc/Hacl_Gf128_CT64.c b/src/msvc/Hacl_Gf128_CT64.c
new file mode 100644
index 00000000..1d0f8bc6
--- /dev/null
+++ b/src/msvc/Hacl_Gf128_CT64.c
@@ -0,0 +1,1801 @@
+/* MIT License
+ *
+ * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation
+ * Copyright (c) 2022-2023 HACL* Contributors
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "Hacl_Gf128_CT64.h" + +static inline void fmul0(uint64_t *x, uint64_t *y) +{ + uint64_t uu____0 = y[0U]; + uint64_t + x10 = + (uu____0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x20 = + (x10 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x10 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x30 = + (x20 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x20 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x4 = + (x30 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x30 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x5 = + (x4 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x4 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t yr1 = x5 << (uint32_t)32U | x5 >> (uint32_t)32U; + uint64_t uu____1 = y[1U]; + uint64_t + x11 = + (uu____1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x21 = + (x11 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x11 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x31 = + (x21 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x21 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x40 = + (x31 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x31 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x50 = + (x40 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x40 >> 
(uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t yr2 = x50 << (uint32_t)32U | x50 >> (uint32_t)32U; + uint64_t uu____2 = x[0U]; + uint64_t uu____3 = x[1U]; + uint64_t uu____4 = y[0U]; + uint64_t uu____5 = y[1U]; + uint64_t uu____6 = y[0U] ^ y[1U]; + uint64_t + x12 = + (uu____2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x22 = + (x12 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x12 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x32 = + (x22 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x22 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x41 = + (x32 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x32 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x51 = + (x41 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x41 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r = x51 << (uint32_t)32U | x51 >> (uint32_t)32U; + uint64_t + x13 = + (uu____3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x23 = + (x13 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x13 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x33 = + (x23 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x23 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x42 = + (x33 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x33 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x52 = + (x42 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x42 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r = x52 << (uint32_t)32U | x52 >> (uint32_t)32U; + uint64_t y3 = uu____2 ^ uu____3; + uint64_t 
y3r = y1r ^ y2r; + uint64_t x00 = uu____2 & (uint64_t)0x1111111111111111U; + uint64_t x14 = uu____2 & (uint64_t)0x2222222222222222U; + uint64_t x24 = uu____2 & (uint64_t)0x4444444444444444U; + uint64_t x34 = uu____2 & (uint64_t)0x8888888888888888U; + uint64_t y00 = uu____4 & (uint64_t)0x1111111111111111U; + uint64_t y10 = uu____4 & (uint64_t)0x2222222222222222U; + uint64_t y20 = uu____4 & (uint64_t)0x4444444444444444U; + uint64_t y310 = uu____4 & (uint64_t)0x8888888888888888U; + uint64_t z0 = x00 * y00 ^ (x14 * y310 ^ (x24 * y20 ^ x34 * y10)); + uint64_t z10 = x00 * y10 ^ (x14 * y00 ^ (x24 * y310 ^ x34 * y20)); + uint64_t z20 = x00 * y20 ^ (x14 * y10 ^ (x24 * y00 ^ x34 * y310)); + uint64_t z30 = x00 * y310 ^ (x14 * y20 ^ (x24 * y10 ^ x34 * y00)); + uint64_t + z00 = + (((z0 & (uint64_t)0x1111111111111111U) | (z10 & (uint64_t)0x2222222222222222U)) + | (z20 & (uint64_t)0x4444444444444444U)) + | (z30 & (uint64_t)0x8888888888888888U); + uint64_t x01 = uu____3 & (uint64_t)0x1111111111111111U; + uint64_t x15 = uu____3 & (uint64_t)0x2222222222222222U; + uint64_t x25 = uu____3 & (uint64_t)0x4444444444444444U; + uint64_t x35 = uu____3 & (uint64_t)0x8888888888888888U; + uint64_t y01 = uu____5 & (uint64_t)0x1111111111111111U; + uint64_t y11 = uu____5 & (uint64_t)0x2222222222222222U; + uint64_t y21 = uu____5 & (uint64_t)0x4444444444444444U; + uint64_t y311 = uu____5 & (uint64_t)0x8888888888888888U; + uint64_t z010 = x01 * y01 ^ (x15 * y311 ^ (x25 * y21 ^ x35 * y11)); + uint64_t z12 = x01 * y11 ^ (x15 * y01 ^ (x25 * y311 ^ x35 * y21)); + uint64_t z22 = x01 * y21 ^ (x15 * y11 ^ (x25 * y01 ^ x35 * y311)); + uint64_t z31 = x01 * y311 ^ (x15 * y21 ^ (x25 * y11 ^ x35 * y01)); + uint64_t + z13 = + (((z010 & (uint64_t)0x1111111111111111U) | (z12 & (uint64_t)0x2222222222222222U)) + | (z22 & (uint64_t)0x4444444444444444U)) + | (z31 & (uint64_t)0x8888888888888888U); + uint64_t x02 = y3 & (uint64_t)0x1111111111111111U; + uint64_t x16 = y3 & (uint64_t)0x2222222222222222U; + uint64_t x26 = 
y3 & (uint64_t)0x4444444444444444U; + uint64_t x36 = y3 & (uint64_t)0x8888888888888888U; + uint64_t y02 = uu____6 & (uint64_t)0x1111111111111111U; + uint64_t y12 = uu____6 & (uint64_t)0x2222222222222222U; + uint64_t y22 = uu____6 & (uint64_t)0x4444444444444444U; + uint64_t y312 = uu____6 & (uint64_t)0x8888888888888888U; + uint64_t z011 = x02 * y02 ^ (x16 * y312 ^ (x26 * y22 ^ x36 * y12)); + uint64_t z110 = x02 * y12 ^ (x16 * y02 ^ (x26 * y312 ^ x36 * y22)); + uint64_t z23 = x02 * y22 ^ (x16 * y12 ^ (x26 * y02 ^ x36 * y312)); + uint64_t z32 = x02 * y312 ^ (x16 * y22 ^ (x26 * y12 ^ x36 * y02)); + uint64_t + z24 = + (((z011 & (uint64_t)0x1111111111111111U) | (z110 & (uint64_t)0x2222222222222222U)) + | (z23 & (uint64_t)0x4444444444444444U)) + | (z32 & (uint64_t)0x8888888888888888U); + uint64_t x03 = y1r & (uint64_t)0x1111111111111111U; + uint64_t x17 = y1r & (uint64_t)0x2222222222222222U; + uint64_t x27 = y1r & (uint64_t)0x4444444444444444U; + uint64_t x37 = y1r & (uint64_t)0x8888888888888888U; + uint64_t y03 = yr1 & (uint64_t)0x1111111111111111U; + uint64_t y13 = yr1 & (uint64_t)0x2222222222222222U; + uint64_t y23 = yr1 & (uint64_t)0x4444444444444444U; + uint64_t y313 = yr1 & (uint64_t)0x8888888888888888U; + uint64_t z012 = x03 * y03 ^ (x17 * y313 ^ (x27 * y23 ^ x37 * y13)); + uint64_t z111 = x03 * y13 ^ (x17 * y03 ^ (x27 * y313 ^ x37 * y23)); + uint64_t z210 = x03 * y23 ^ (x17 * y13 ^ (x27 * y03 ^ x37 * y313)); + uint64_t z33 = x03 * y313 ^ (x17 * y23 ^ (x27 * y13 ^ x37 * y03)); + uint64_t + z0h = + (((z012 & (uint64_t)0x1111111111111111U) | (z111 & (uint64_t)0x2222222222222222U)) + | (z210 & (uint64_t)0x4444444444444444U)) + | (z33 & (uint64_t)0x8888888888888888U); + uint64_t x04 = y2r & (uint64_t)0x1111111111111111U; + uint64_t x18 = y2r & (uint64_t)0x2222222222222222U; + uint64_t x28 = y2r & (uint64_t)0x4444444444444444U; + uint64_t x38 = y2r & (uint64_t)0x8888888888888888U; + uint64_t y04 = yr2 & (uint64_t)0x1111111111111111U; + uint64_t y14 = yr2 & 
(uint64_t)0x2222222222222222U; + uint64_t y24 = yr2 & (uint64_t)0x4444444444444444U; + uint64_t y314 = yr2 & (uint64_t)0x8888888888888888U; + uint64_t z013 = x04 * y04 ^ (x18 * y314 ^ (x28 * y24 ^ x38 * y14)); + uint64_t z112 = x04 * y14 ^ (x18 * y04 ^ (x28 * y314 ^ x38 * y24)); + uint64_t z211 = x04 * y24 ^ (x18 * y14 ^ (x28 * y04 ^ x38 * y314)); + uint64_t z34 = x04 * y314 ^ (x18 * y24 ^ (x28 * y14 ^ x38 * y04)); + uint64_t + z1h = + (((z013 & (uint64_t)0x1111111111111111U) | (z112 & (uint64_t)0x2222222222222222U)) + | (z211 & (uint64_t)0x4444444444444444U)) + | (z34 & (uint64_t)0x8888888888888888U); + uint64_t x0 = y3r & (uint64_t)0x1111111111111111U; + uint64_t x19 = y3r & (uint64_t)0x2222222222222222U; + uint64_t x29 = y3r & (uint64_t)0x4444444444444444U; + uint64_t x3 = y3r & (uint64_t)0x8888888888888888U; + uint64_t y0 = (yr1 ^ yr2) & (uint64_t)0x1111111111111111U; + uint64_t y1 = (yr1 ^ yr2) & (uint64_t)0x2222222222222222U; + uint64_t y2 = (yr1 ^ yr2) & (uint64_t)0x4444444444444444U; + uint64_t y31 = (yr1 ^ yr2) & (uint64_t)0x8888888888888888U; + uint64_t z01 = x0 * y0 ^ (x19 * y31 ^ (x29 * y2 ^ x3 * y1)); + uint64_t z11 = x0 * y1 ^ (x19 * y0 ^ (x29 * y31 ^ x3 * y2)); + uint64_t z212 = x0 * y2 ^ (x19 * y1 ^ (x29 * y0 ^ x3 * y31)); + uint64_t z35 = x0 * y31 ^ (x19 * y2 ^ (x29 * y1 ^ x3 * y0)); + uint64_t + z2h = + (((z01 & (uint64_t)0x1111111111111111U) | (z11 & (uint64_t)0x2222222222222222U)) + | (z212 & (uint64_t)0x4444444444444444U)) + | (z35 & (uint64_t)0x8888888888888888U); + uint64_t z21 = z24 ^ (z00 ^ z13); + uint64_t z2h1 = z2h ^ (z0h ^ z1h); + uint64_t + x110 = + (z0h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x210 = + (x110 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x110 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x39 = + (x210 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x210 >> 
(uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x43 = + (x39 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x39 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x53 = + (x43 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x43 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h1 = (x53 << (uint32_t)32U | x53 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x111 = + (z1h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x211 = + (x111 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x111 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x310 = + (x211 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x211 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x44 = + (x310 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x310 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x54 = + (x44 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x44 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h1 = (x54 << (uint32_t)32U | x54 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x1 = + (z2h1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x212 = + (x1 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x1 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x311 = + (x212 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x212 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x45 = + (x311 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x311 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x55 = + (x45 & (uint64_t)0x0000FFFF0000FFFFU) + << 
(uint32_t)(uint8_t)16U + | (x45 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h2 = (x55 << (uint32_t)32U | x55 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z1 = z00; + uint64_t z2 = z0h1 ^ z21; + uint64_t z3 = z13 ^ z2h2; + uint64_t z4 = z1h1; + uint64_t v3 = z4 << (uint32_t)1U | z3 >> (uint32_t)63U; + uint64_t v2 = z3 << (uint32_t)1U | z2 >> (uint32_t)63U; + uint64_t v1 = z2 << (uint32_t)1U | z1 >> (uint32_t)63U; + uint64_t v0 = z1 << (uint32_t)1U; + uint64_t v21 = v2 ^ (v0 ^ (v0 >> (uint32_t)1U ^ (v0 >> (uint32_t)2U ^ v0 >> (uint32_t)7U))); + uint64_t v11 = v1 ^ (v0 << (uint32_t)63U ^ (v0 << (uint32_t)62U ^ v0 << (uint32_t)57U)); + uint64_t + v31 = v3 ^ (v11 ^ (v11 >> (uint32_t)1U ^ (v11 >> (uint32_t)2U ^ v11 >> (uint32_t)7U))); + uint64_t v22 = v21 ^ (v11 << (uint32_t)63U ^ (v11 << (uint32_t)62U ^ v11 << (uint32_t)57U)); + uint64_t x112 = v22; + uint64_t x2 = v31; + x[0U] = x112; + x[1U] = x2; +}
+
+/*
+ * Reverse the order of the 64 bits of `w` (bit 0 <-> bit 63, bit 1 <-> bit 62, ...).
+ *
+ * Works by swapping progressively larger groups: adjacent bits, bit pairs,
+ * nibbles, bytes, 16-bit halves and finally the two 32-bit halves.
+ * Only shifts, masks and ORs are used; there is no data-dependent branch
+ * or table lookup.
+ */
+static inline uint64_t load_precompute_r_rev64(uint64_t w)
+{
+  w = ((w & (uint64_t)0x5555555555555555U) << 1U) | ((w >> 1U) & (uint64_t)0x5555555555555555U);
+  w = ((w & (uint64_t)0x3333333333333333U) << 2U) | ((w >> 2U) & (uint64_t)0x3333333333333333U);
+  w = ((w & (uint64_t)0x0F0F0F0F0F0F0F0FU) << 4U) | ((w >> 4U) & (uint64_t)0x0F0F0F0F0F0F0F0FU);
+  w = ((w & (uint64_t)0x00FF00FF00FF00FFU) << 8U) | ((w >> 8U) & (uint64_t)0x00FF00FF00FF00FFU);
+  w = ((w & (uint64_t)0x0000FFFF0000FFFFU) << 16U) | ((w >> 16U) & (uint64_t)0x0000FFFF0000FFFFU);
+  return (w << 32U) | (w >> 32U);
+}
+
+/*
+ * Fill the 16-word table `pre` from the 16-byte value at `key`.
+ *
+ * pre : output, at least 16 uint64_t words. Layout after the call:
+ *         pre[6..7]   h1 - `key` read as two big-endian 64-bit words
+ *                          (bytes 0..7 go to word 1, bytes 8..15 to word 0)
+ *         pre[4..5]   h2 - copy of h1 passed through fmul0(h2, h1)
+ *         pre[2..3]   h3 - copy of h1 passed through fmul0(h3, h2)
+ *         pre[0..1]   h4 - copy of h1 passed through fmul0(h4, h3)
+ *         pre[8..15]  bit-reversed images: pre[8 + i] = rev64(pre[i])
+ * key : input, 16 readable bytes.
+ *
+ * NOTE(review): fmul0 is presumably the GF(2^128) multiplication x <- x * y,
+ * which would make h2, h3, h4 the 2nd, 3rd and 4th powers of h1 - confirm
+ * against fmul0's definition/specification.
+ */
+static inline void load_precompute_r(uint64_t *pre, uint8_t *key)
+{
+  uint64_t *h1_0 = pre + (uint32_t)6U;
+  uint64_t *h2_0 = pre + (uint32_t)4U;
+  uint64_t *h3_0 = pre + (uint32_t)2U;
+  uint64_t *h4_0 = pre;
+
+  /* The first 8 key bytes land in the upper-index word, the last 8 in the lower one. */
+  h1_0[1U] = load64_be(key);
+  h1_0[0U] = load64_be(key + (uint32_t)8U);
+
+  /* Seed every other slot with h1 before the chained fmul0 calls update them in place. */
+  h2_0[0U] = h1_0[0U];
+  h2_0[1U] = h1_0[1U];
+  h3_0[0U] = h1_0[0U];
+  h3_0[1U] = h1_0[1U];
+  h4_0[0U] = h1_0[0U];
+  h4_0[1U] = h1_0[1U];
+  fmul0(h2_0, h1_0);
+  fmul0(h3_0, h2_0);
+  fmul0(h4_0, h3_0);
+
+  /*
+   * Upper half of the table: each word of pre[0..7] with its bits reversed.
+   * The loop only reads pre[0..7] and only writes pre[8..15], so the order
+   * of the eight stores does not matter.
+   */
+  for (uint32_t i = (uint32_t)0U; i < (uint32_t)8U; i++)
+  {
+    pre[(uint32_t)8U + i] = load_precompute_r_rev64(pre[i]);
+  }
+}
+
+static inline void normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) +{ + uint64_t *x1 = x; + uint64_t *x2 = x + (uint32_t)2U; + uint64_t *x3 = x + (uint32_t)4U; + uint64_t *x4 = x + (uint32_t)6U; + uint64_t *y1 = pre; + uint64_t *y2 = pre + (uint32_t)2U; + uint64_t *y3 = pre + (uint32_t)4U; + uint64_t *y4 = pre + (uint32_t)6U; + uint64_t *yr1 = pre + (uint32_t)8U; + uint64_t *yr2 = pre + (uint32_t)10U; + uint64_t *yr3 = pre + (uint32_t)12U; + uint64_t *yr4 = pre + (uint32_t)14U; + uint64_t uu____0 = x1[0U]; + uint64_t uu____1 = x1[1U]; + uint64_t uu____2 = y1[0U]; + uint64_t uu____3 = y1[1U]; + uint64_t uu____4 = y1[0U] ^ y1[1U]; + uint64_t uu____5 = yr1[0U]; + uint64_t uu____6 = yr1[1U]; + uint64_t uu____7 = yr1[0U] ^ yr1[1U]; + uint64_t + x50 = + (uu____0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x6 = + (x50 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x50 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x7 = + (x6 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x6 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x8 = + (x7 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x7 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x9 = + (x8 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x8 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r = x9 <<
(uint32_t)32U | x9 >> (uint32_t)32U; + uint64_t + x51 = + (uu____1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x60 = + (x51 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x51 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x70 = + (x60 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x60 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x80 = + (x70 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x70 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x90 = + (x80 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x80 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r = x90 << (uint32_t)32U | x90 >> (uint32_t)32U; + uint64_t y310 = uu____0 ^ uu____1; + uint64_t y3r0 = y1r ^ y2r; + uint64_t x00 = uu____0 & (uint64_t)0x1111111111111111U; + uint64_t x110 = uu____0 & (uint64_t)0x2222222222222222U; + uint64_t x210 = uu____0 & (uint64_t)0x4444444444444444U; + uint64_t x310 = uu____0 & (uint64_t)0x8888888888888888U; + uint64_t y00 = uu____2 & (uint64_t)0x1111111111111111U; + uint64_t y110 = uu____2 & (uint64_t)0x2222222222222222U; + uint64_t y210 = uu____2 & (uint64_t)0x4444444444444444U; + uint64_t y320 = uu____2 & (uint64_t)0x8888888888888888U; + uint64_t z00 = x00 * y00 ^ (x110 * y320 ^ (x210 * y210 ^ x310 * y110)); + uint64_t z10 = x00 * y110 ^ (x110 * y00 ^ (x210 * y320 ^ x310 * y210)); + uint64_t z20 = x00 * y210 ^ (x110 * y110 ^ (x210 * y00 ^ x310 * y320)); + uint64_t z30 = x00 * y320 ^ (x110 * y210 ^ (x210 * y110 ^ x310 * y00)); + uint64_t + z02 = + (((z00 & (uint64_t)0x1111111111111111U) | (z10 & (uint64_t)0x2222222222222222U)) + | (z20 & (uint64_t)0x4444444444444444U)) + | (z30 & (uint64_t)0x8888888888888888U); + uint64_t x01 = uu____1 & (uint64_t)0x1111111111111111U; + uint64_t x111 = uu____1 & 
(uint64_t)0x2222222222222222U; + uint64_t x211 = uu____1 & (uint64_t)0x4444444444444444U; + uint64_t x311 = uu____1 & (uint64_t)0x8888888888888888U; + uint64_t y01 = uu____3 & (uint64_t)0x1111111111111111U; + uint64_t y111 = uu____3 & (uint64_t)0x2222222222222222U; + uint64_t y211 = uu____3 & (uint64_t)0x4444444444444444U; + uint64_t y321 = uu____3 & (uint64_t)0x8888888888888888U; + uint64_t z010 = x01 * y01 ^ (x111 * y321 ^ (x211 * y211 ^ x311 * y111)); + uint64_t z14 = x01 * y111 ^ (x111 * y01 ^ (x211 * y321 ^ x311 * y211)); + uint64_t z24 = x01 * y211 ^ (x111 * y111 ^ (x211 * y01 ^ x311 * y321)); + uint64_t z33 = x01 * y321 ^ (x111 * y211 ^ (x211 * y111 ^ x311 * y01)); + uint64_t + z15 = + (((z010 & (uint64_t)0x1111111111111111U) | (z14 & (uint64_t)0x2222222222222222U)) + | (z24 & (uint64_t)0x4444444444444444U)) + | (z33 & (uint64_t)0x8888888888888888U); + uint64_t x02 = y310 & (uint64_t)0x1111111111111111U; + uint64_t x112 = y310 & (uint64_t)0x2222222222222222U; + uint64_t x212 = y310 & (uint64_t)0x4444444444444444U; + uint64_t x312 = y310 & (uint64_t)0x8888888888888888U; + uint64_t y02 = uu____4 & (uint64_t)0x1111111111111111U; + uint64_t y112 = uu____4 & (uint64_t)0x2222222222222222U; + uint64_t y212 = uu____4 & (uint64_t)0x4444444444444444U; + uint64_t y322 = uu____4 & (uint64_t)0x8888888888888888U; + uint64_t z011 = x02 * y02 ^ (x112 * y322 ^ (x212 * y212 ^ x312 * y112)); + uint64_t z110 = x02 * y112 ^ (x112 * y02 ^ (x212 * y322 ^ x312 * y212)); + uint64_t z25 = x02 * y212 ^ (x112 * y112 ^ (x212 * y02 ^ x312 * y322)); + uint64_t z34 = x02 * y322 ^ (x112 * y212 ^ (x212 * y112 ^ x312 * y02)); + uint64_t + z26 = + (((z011 & (uint64_t)0x1111111111111111U) | (z110 & (uint64_t)0x2222222222222222U)) + | (z25 & (uint64_t)0x4444444444444444U)) + | (z34 & (uint64_t)0x8888888888888888U); + uint64_t x03 = y1r & (uint64_t)0x1111111111111111U; + uint64_t x113 = y1r & (uint64_t)0x2222222222222222U; + uint64_t x213 = y1r & (uint64_t)0x4444444444444444U; + uint64_t x313 = 
y1r & (uint64_t)0x8888888888888888U; + uint64_t y03 = uu____5 & (uint64_t)0x1111111111111111U; + uint64_t y113 = uu____5 & (uint64_t)0x2222222222222222U; + uint64_t y213 = uu____5 & (uint64_t)0x4444444444444444U; + uint64_t y323 = uu____5 & (uint64_t)0x8888888888888888U; + uint64_t z012 = x03 * y03 ^ (x113 * y323 ^ (x213 * y213 ^ x313 * y113)); + uint64_t z111 = x03 * y113 ^ (x113 * y03 ^ (x213 * y323 ^ x313 * y213)); + uint64_t z210 = x03 * y213 ^ (x113 * y113 ^ (x213 * y03 ^ x313 * y323)); + uint64_t z35 = x03 * y323 ^ (x113 * y213 ^ (x213 * y113 ^ x313 * y03)); + uint64_t + z0h = + (((z012 & (uint64_t)0x1111111111111111U) | (z111 & (uint64_t)0x2222222222222222U)) + | (z210 & (uint64_t)0x4444444444444444U)) + | (z35 & (uint64_t)0x8888888888888888U); + uint64_t x04 = y2r & (uint64_t)0x1111111111111111U; + uint64_t x114 = y2r & (uint64_t)0x2222222222222222U; + uint64_t x214 = y2r & (uint64_t)0x4444444444444444U; + uint64_t x314 = y2r & (uint64_t)0x8888888888888888U; + uint64_t y04 = uu____6 & (uint64_t)0x1111111111111111U; + uint64_t y114 = uu____6 & (uint64_t)0x2222222222222222U; + uint64_t y214 = uu____6 & (uint64_t)0x4444444444444444U; + uint64_t y324 = uu____6 & (uint64_t)0x8888888888888888U; + uint64_t z013 = x04 * y04 ^ (x114 * y324 ^ (x214 * y214 ^ x314 * y114)); + uint64_t z112 = x04 * y114 ^ (x114 * y04 ^ (x214 * y324 ^ x314 * y214)); + uint64_t z211 = x04 * y214 ^ (x114 * y114 ^ (x214 * y04 ^ x314 * y324)); + uint64_t z36 = x04 * y324 ^ (x114 * y214 ^ (x214 * y114 ^ x314 * y04)); + uint64_t + z1h = + (((z013 & (uint64_t)0x1111111111111111U) | (z112 & (uint64_t)0x2222222222222222U)) + | (z211 & (uint64_t)0x4444444444444444U)) + | (z36 & (uint64_t)0x8888888888888888U); + uint64_t x05 = y3r0 & (uint64_t)0x1111111111111111U; + uint64_t x115 = y3r0 & (uint64_t)0x2222222222222222U; + uint64_t x215 = y3r0 & (uint64_t)0x4444444444444444U; + uint64_t x315 = y3r0 & (uint64_t)0x8888888888888888U; + uint64_t y05 = uu____7 & (uint64_t)0x1111111111111111U; + uint64_t 
y115 = uu____7 & (uint64_t)0x2222222222222222U; + uint64_t y215 = uu____7 & (uint64_t)0x4444444444444444U; + uint64_t y325 = uu____7 & (uint64_t)0x8888888888888888U; + uint64_t z014 = x05 * y05 ^ (x115 * y325 ^ (x215 * y215 ^ x315 * y115)); + uint64_t z113 = x05 * y115 ^ (x115 * y05 ^ (x215 * y325 ^ x315 * y215)); + uint64_t z212 = x05 * y215 ^ (x115 * y115 ^ (x215 * y05 ^ x315 * y325)); + uint64_t z37 = x05 * y325 ^ (x115 * y215 ^ (x215 * y115 ^ x315 * y05)); + uint64_t + z2h = + (((z014 & (uint64_t)0x1111111111111111U) | (z113 & (uint64_t)0x2222222222222222U)) + | (z212 & (uint64_t)0x4444444444444444U)) + | (z37 & (uint64_t)0x8888888888888888U); + uint64_t z213 = z26 ^ (z02 ^ z15); + uint64_t z2h10 = z2h ^ (z0h ^ z1h); + uint64_t + x52 = + (z0h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x61 = + (x52 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x52 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x71 = + (x61 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x61 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x81 = + (x71 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x71 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x91 = + (x81 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x81 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h1 = (x91 << (uint32_t)32U | x91 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x53 = + (z1h & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x62 = + (x53 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x53 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x72 = + (x62 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x62 >> 
(uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x82 = + (x72 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x72 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x92 = + (x82 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x82 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h1 = (x92 << (uint32_t)32U | x92 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x54 = + (z2h10 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h10 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x63 = + (x54 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x54 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x73 = + (x63 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x63 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x83 = + (x73 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x73 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x93 = + (x83 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x83 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h2 = (x93 << (uint32_t)32U | x93 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z1_1 = z02; + uint64_t z1_2 = z0h1 ^ z213; + uint64_t z1_3 = z15 ^ z2h2; + uint64_t z1_4 = z1h1; + uint64_t uu____8 = x2[0U]; + uint64_t uu____9 = x2[1U]; + uint64_t uu____10 = y2[0U]; + uint64_t uu____11 = y2[1U]; + uint64_t uu____12 = y2[0U] ^ y2[1U]; + uint64_t uu____13 = yr2[0U]; + uint64_t uu____14 = yr2[1U]; + uint64_t uu____15 = yr2[0U] ^ yr2[1U]; + uint64_t + x55 = + (uu____8 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____8 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x64 = + (x55 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x55 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t 
+ x74 = + (x64 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x64 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x84 = + (x74 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x74 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x94 = + (x84 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x84 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r0 = x94 << (uint32_t)32U | x94 >> (uint32_t)32U; + uint64_t + x56 = + (uu____9 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____9 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x65 = + (x56 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x56 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x75 = + (x65 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x65 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x85 = + (x75 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x75 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x95 = + (x85 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x85 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r0 = x95 << (uint32_t)32U | x95 >> (uint32_t)32U; + uint64_t y311 = uu____8 ^ uu____9; + uint64_t y3r1 = y1r0 ^ y2r0; + uint64_t x06 = uu____8 & (uint64_t)0x1111111111111111U; + uint64_t x116 = uu____8 & (uint64_t)0x2222222222222222U; + uint64_t x216 = uu____8 & (uint64_t)0x4444444444444444U; + uint64_t x316 = uu____8 & (uint64_t)0x8888888888888888U; + uint64_t y06 = uu____10 & (uint64_t)0x1111111111111111U; + uint64_t y116 = uu____10 & (uint64_t)0x2222222222222222U; + uint64_t y216 = uu____10 & (uint64_t)0x4444444444444444U; + uint64_t y326 = uu____10 & (uint64_t)0x8888888888888888U; + uint64_t z03 = x06 * y06 ^ (x116 * y326 ^ (x216 * y216 ^ x316 * y116)); + uint64_t z16 = x06 * 
y116 ^ (x116 * y06 ^ (x216 * y326 ^ x316 * y216)); + uint64_t z27 = x06 * y216 ^ (x116 * y116 ^ (x216 * y06 ^ x316 * y326)); + uint64_t z38 = x06 * y326 ^ (x116 * y216 ^ (x216 * y116 ^ x316 * y06)); + uint64_t + z04 = + (((z03 & (uint64_t)0x1111111111111111U) | (z16 & (uint64_t)0x2222222222222222U)) + | (z27 & (uint64_t)0x4444444444444444U)) + | (z38 & (uint64_t)0x8888888888888888U); + uint64_t x07 = uu____9 & (uint64_t)0x1111111111111111U; + uint64_t x117 = uu____9 & (uint64_t)0x2222222222222222U; + uint64_t x217 = uu____9 & (uint64_t)0x4444444444444444U; + uint64_t x317 = uu____9 & (uint64_t)0x8888888888888888U; + uint64_t y07 = uu____11 & (uint64_t)0x1111111111111111U; + uint64_t y117 = uu____11 & (uint64_t)0x2222222222222222U; + uint64_t y217 = uu____11 & (uint64_t)0x4444444444444444U; + uint64_t y327 = uu____11 & (uint64_t)0x8888888888888888U; + uint64_t z015 = x07 * y07 ^ (x117 * y327 ^ (x217 * y217 ^ x317 * y117)); + uint64_t z17 = x07 * y117 ^ (x117 * y07 ^ (x217 * y327 ^ x317 * y217)); + uint64_t z28 = x07 * y217 ^ (x117 * y117 ^ (x217 * y07 ^ x317 * y327)); + uint64_t z39 = x07 * y327 ^ (x117 * y217 ^ (x217 * y117 ^ x317 * y07)); + uint64_t + z18 = + (((z015 & (uint64_t)0x1111111111111111U) | (z17 & (uint64_t)0x2222222222222222U)) + | (z28 & (uint64_t)0x4444444444444444U)) + | (z39 & (uint64_t)0x8888888888888888U); + uint64_t x08 = y311 & (uint64_t)0x1111111111111111U; + uint64_t x118 = y311 & (uint64_t)0x2222222222222222U; + uint64_t x218 = y311 & (uint64_t)0x4444444444444444U; + uint64_t x318 = y311 & (uint64_t)0x8888888888888888U; + uint64_t y08 = uu____12 & (uint64_t)0x1111111111111111U; + uint64_t y118 = uu____12 & (uint64_t)0x2222222222222222U; + uint64_t y218 = uu____12 & (uint64_t)0x4444444444444444U; + uint64_t y328 = uu____12 & (uint64_t)0x8888888888888888U; + uint64_t z016 = x08 * y08 ^ (x118 * y328 ^ (x218 * y218 ^ x318 * y118)); + uint64_t z114 = x08 * y118 ^ (x118 * y08 ^ (x218 * y328 ^ x318 * y218)); + uint64_t z29 = x08 * y218 ^ (x118 * 
y118 ^ (x218 * y08 ^ x318 * y328)); + uint64_t z310 = x08 * y328 ^ (x118 * y218 ^ (x218 * y118 ^ x318 * y08)); + uint64_t + z214 = + (((z016 & (uint64_t)0x1111111111111111U) | (z114 & (uint64_t)0x2222222222222222U)) + | (z29 & (uint64_t)0x4444444444444444U)) + | (z310 & (uint64_t)0x8888888888888888U); + uint64_t x09 = y1r0 & (uint64_t)0x1111111111111111U; + uint64_t x119 = y1r0 & (uint64_t)0x2222222222222222U; + uint64_t x219 = y1r0 & (uint64_t)0x4444444444444444U; + uint64_t x319 = y1r0 & (uint64_t)0x8888888888888888U; + uint64_t y09 = uu____13 & (uint64_t)0x1111111111111111U; + uint64_t y119 = uu____13 & (uint64_t)0x2222222222222222U; + uint64_t y219 = uu____13 & (uint64_t)0x4444444444444444U; + uint64_t y329 = uu____13 & (uint64_t)0x8888888888888888U; + uint64_t z017 = x09 * y09 ^ (x119 * y329 ^ (x219 * y219 ^ x319 * y119)); + uint64_t z115 = x09 * y119 ^ (x119 * y09 ^ (x219 * y329 ^ x319 * y219)); + uint64_t z215 = x09 * y219 ^ (x119 * y119 ^ (x219 * y09 ^ x319 * y329)); + uint64_t z311 = x09 * y329 ^ (x119 * y219 ^ (x219 * y119 ^ x319 * y09)); + uint64_t + z0h0 = + (((z017 & (uint64_t)0x1111111111111111U) | (z115 & (uint64_t)0x2222222222222222U)) + | (z215 & (uint64_t)0x4444444444444444U)) + | (z311 & (uint64_t)0x8888888888888888U); + uint64_t x010 = y2r0 & (uint64_t)0x1111111111111111U; + uint64_t x1110 = y2r0 & (uint64_t)0x2222222222222222U; + uint64_t x2110 = y2r0 & (uint64_t)0x4444444444444444U; + uint64_t x3110 = y2r0 & (uint64_t)0x8888888888888888U; + uint64_t y010 = uu____14 & (uint64_t)0x1111111111111111U; + uint64_t y1110 = uu____14 & (uint64_t)0x2222222222222222U; + uint64_t y2110 = uu____14 & (uint64_t)0x4444444444444444U; + uint64_t y3210 = uu____14 & (uint64_t)0x8888888888888888U; + uint64_t z018 = x010 * y010 ^ (x1110 * y3210 ^ (x2110 * y2110 ^ x3110 * y1110)); + uint64_t z116 = x010 * y1110 ^ (x1110 * y010 ^ (x2110 * y3210 ^ x3110 * y2110)); + uint64_t z216 = x010 * y2110 ^ (x1110 * y1110 ^ (x2110 * y010 ^ x3110 * y3210)); + uint64_t z312 = x010 
* y3210 ^ (x1110 * y2110 ^ (x2110 * y1110 ^ x3110 * y010)); + uint64_t + z1h0 = + (((z018 & (uint64_t)0x1111111111111111U) | (z116 & (uint64_t)0x2222222222222222U)) + | (z216 & (uint64_t)0x4444444444444444U)) + | (z312 & (uint64_t)0x8888888888888888U); + uint64_t x011 = y3r1 & (uint64_t)0x1111111111111111U; + uint64_t x1111 = y3r1 & (uint64_t)0x2222222222222222U; + uint64_t x2111 = y3r1 & (uint64_t)0x4444444444444444U; + uint64_t x3111 = y3r1 & (uint64_t)0x8888888888888888U; + uint64_t y011 = uu____15 & (uint64_t)0x1111111111111111U; + uint64_t y1111 = uu____15 & (uint64_t)0x2222222222222222U; + uint64_t y2111 = uu____15 & (uint64_t)0x4444444444444444U; + uint64_t y3211 = uu____15 & (uint64_t)0x8888888888888888U; + uint64_t z019 = x011 * y011 ^ (x1111 * y3211 ^ (x2111 * y2111 ^ x3111 * y1111)); + uint64_t z117 = x011 * y1111 ^ (x1111 * y011 ^ (x2111 * y3211 ^ x3111 * y2111)); + uint64_t z217 = x011 * y2111 ^ (x1111 * y1111 ^ (x2111 * y011 ^ x3111 * y3211)); + uint64_t z313 = x011 * y3211 ^ (x1111 * y2111 ^ (x2111 * y1111 ^ x3111 * y011)); + uint64_t + z2h0 = + (((z019 & (uint64_t)0x1111111111111111U) | (z117 & (uint64_t)0x2222222222222222U)) + | (z217 & (uint64_t)0x4444444444444444U)) + | (z313 & (uint64_t)0x8888888888888888U); + uint64_t z218 = z214 ^ (z04 ^ z18); + uint64_t z2h11 = z2h0 ^ (z0h0 ^ z1h0); + uint64_t + x57 = + (z0h0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x66 = + (x57 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x57 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x76 = + (x66 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x66 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x86 = + (x76 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x76 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x96 = + (x86 & 
(uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x86 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h10 = (x96 << (uint32_t)32U | x96 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x58 = + (z1h0 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h0 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x67 = + (x58 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x58 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x77 = + (x67 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x67 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x87 = + (x77 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x77 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x97 = + (x87 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x87 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h10 = (x97 << (uint32_t)32U | x97 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x59 = + (z2h11 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h11 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x68 = + (x59 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x59 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x78 = + (x68 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x68 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x88 = + (x78 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x78 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x98 = + (x88 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x88 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h20 = (x98 << (uint32_t)32U | x98 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z2_1 = z04; + uint64_t z2_2 = z0h10 ^ z218; + uint64_t z2_3 = 
z18 ^ z2h20; + uint64_t z2_4 = z1h10; + uint64_t z1 = z1_1 ^ z2_1; + uint64_t z2 = z1_2 ^ z2_2; + uint64_t z3 = z1_3 ^ z2_3; + uint64_t z4 = z1_4 ^ z2_4; + uint64_t uu____16 = x3[0U]; + uint64_t uu____17 = x3[1U]; + uint64_t uu____18 = y3[0U]; + uint64_t uu____19 = y3[1U]; + uint64_t uu____20 = y3[0U] ^ y3[1U]; + uint64_t uu____21 = yr3[0U]; + uint64_t uu____22 = yr3[1U]; + uint64_t uu____23 = yr3[0U] ^ yr3[1U]; + uint64_t + x510 = + (uu____16 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____16 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x69 = + (x510 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x510 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x79 = + (x69 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x69 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x89 = + (x79 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x79 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x99 = + (x89 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x89 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r1 = x99 << (uint32_t)32U | x99 >> (uint32_t)32U; + uint64_t + x511 = + (uu____17 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____17 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x610 = + (x511 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x511 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x710 = + (x610 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x610 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x810 = + (x710 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x710 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x910 = + (x810 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x810 
>> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r1 = x910 << (uint32_t)32U | x910 >> (uint32_t)32U; + uint64_t y312 = uu____16 ^ uu____17; + uint64_t y3r2 = y1r1 ^ y2r1; + uint64_t x012 = uu____16 & (uint64_t)0x1111111111111111U; + uint64_t x1112 = uu____16 & (uint64_t)0x2222222222222222U; + uint64_t x2112 = uu____16 & (uint64_t)0x4444444444444444U; + uint64_t x3112 = uu____16 & (uint64_t)0x8888888888888888U; + uint64_t y012 = uu____18 & (uint64_t)0x1111111111111111U; + uint64_t y1112 = uu____18 & (uint64_t)0x2222222222222222U; + uint64_t y2112 = uu____18 & (uint64_t)0x4444444444444444U; + uint64_t y3212 = uu____18 & (uint64_t)0x8888888888888888U; + uint64_t z05 = x012 * y012 ^ (x1112 * y3212 ^ (x2112 * y2112 ^ x3112 * y1112)); + uint64_t z118 = x012 * y1112 ^ (x1112 * y012 ^ (x2112 * y3212 ^ x3112 * y2112)); + uint64_t z219 = x012 * y2112 ^ (x1112 * y1112 ^ (x2112 * y012 ^ x3112 * y3212)); + uint64_t z314 = x012 * y3212 ^ (x1112 * y2112 ^ (x2112 * y1112 ^ x3112 * y012)); + uint64_t + z06 = + (((z05 & (uint64_t)0x1111111111111111U) | (z118 & (uint64_t)0x2222222222222222U)) + | (z219 & (uint64_t)0x4444444444444444U)) + | (z314 & (uint64_t)0x8888888888888888U); + uint64_t x013 = uu____17 & (uint64_t)0x1111111111111111U; + uint64_t x1113 = uu____17 & (uint64_t)0x2222222222222222U; + uint64_t x2113 = uu____17 & (uint64_t)0x4444444444444444U; + uint64_t x3113 = uu____17 & (uint64_t)0x8888888888888888U; + uint64_t y013 = uu____19 & (uint64_t)0x1111111111111111U; + uint64_t y1113 = uu____19 & (uint64_t)0x2222222222222222U; + uint64_t y2113 = uu____19 & (uint64_t)0x4444444444444444U; + uint64_t y3213 = uu____19 & (uint64_t)0x8888888888888888U; + uint64_t z0110 = x013 * y013 ^ (x1113 * y3213 ^ (x2113 * y2113 ^ x3113 * y1113)); + uint64_t z119 = x013 * y1113 ^ (x1113 * y013 ^ (x2113 * y3213 ^ x3113 * y2113)); + uint64_t z2110 = x013 * y2113 ^ (x1113 * y1113 ^ (x2113 * y013 ^ x3113 * y3213)); + uint64_t z315 = x013 * y3213 ^ (x1113 * y2113 ^ (x2113 * 
y1113 ^ x3113 * y013)); + uint64_t + z1110 = + (((z0110 & (uint64_t)0x1111111111111111U) | (z119 & (uint64_t)0x2222222222222222U)) + | (z2110 & (uint64_t)0x4444444444444444U)) + | (z315 & (uint64_t)0x8888888888888888U); + uint64_t x014 = y312 & (uint64_t)0x1111111111111111U; + uint64_t x1114 = y312 & (uint64_t)0x2222222222222222U; + uint64_t x2114 = y312 & (uint64_t)0x4444444444444444U; + uint64_t x3114 = y312 & (uint64_t)0x8888888888888888U; + uint64_t y014 = uu____20 & (uint64_t)0x1111111111111111U; + uint64_t y1114 = uu____20 & (uint64_t)0x2222222222222222U; + uint64_t y2114 = uu____20 & (uint64_t)0x4444444444444444U; + uint64_t y3214 = uu____20 & (uint64_t)0x8888888888888888U; + uint64_t z0111 = x014 * y014 ^ (x1114 * y3214 ^ (x2114 * y2114 ^ x3114 * y1114)); + uint64_t z120 = x014 * y1114 ^ (x1114 * y014 ^ (x2114 * y3214 ^ x3114 * y2114)); + uint64_t z2111 = x014 * y2114 ^ (x1114 * y1114 ^ (x2114 * y014 ^ x3114 * y3214)); + uint64_t z316 = x014 * y3214 ^ (x1114 * y2114 ^ (x2114 * y1114 ^ x3114 * y014)); + uint64_t + z2112 = + (((z0111 & (uint64_t)0x1111111111111111U) | (z120 & (uint64_t)0x2222222222222222U)) + | (z2111 & (uint64_t)0x4444444444444444U)) + | (z316 & (uint64_t)0x8888888888888888U); + uint64_t x015 = y1r1 & (uint64_t)0x1111111111111111U; + uint64_t x1115 = y1r1 & (uint64_t)0x2222222222222222U; + uint64_t x2115 = y1r1 & (uint64_t)0x4444444444444444U; + uint64_t x3115 = y1r1 & (uint64_t)0x8888888888888888U; + uint64_t y015 = uu____21 & (uint64_t)0x1111111111111111U; + uint64_t y1115 = uu____21 & (uint64_t)0x2222222222222222U; + uint64_t y2115 = uu____21 & (uint64_t)0x4444444444444444U; + uint64_t y3215 = uu____21 & (uint64_t)0x8888888888888888U; + uint64_t z0112 = x015 * y015 ^ (x1115 * y3215 ^ (x2115 * y2115 ^ x3115 * y1115)); + uint64_t z121 = x015 * y1115 ^ (x1115 * y015 ^ (x2115 * y3215 ^ x3115 * y2115)); + uint64_t z220 = x015 * y2115 ^ (x1115 * y1115 ^ (x2115 * y015 ^ x3115 * y3215)); + uint64_t z317 = x015 * y3215 ^ (x1115 * y2115 ^ (x2115 * 
y1115 ^ x3115 * y015)); + uint64_t + z0h2 = + (((z0112 & (uint64_t)0x1111111111111111U) | (z121 & (uint64_t)0x2222222222222222U)) + | (z220 & (uint64_t)0x4444444444444444U)) + | (z317 & (uint64_t)0x8888888888888888U); + uint64_t x016 = y2r1 & (uint64_t)0x1111111111111111U; + uint64_t x1116 = y2r1 & (uint64_t)0x2222222222222222U; + uint64_t x2116 = y2r1 & (uint64_t)0x4444444444444444U; + uint64_t x3116 = y2r1 & (uint64_t)0x8888888888888888U; + uint64_t y016 = uu____22 & (uint64_t)0x1111111111111111U; + uint64_t y1116 = uu____22 & (uint64_t)0x2222222222222222U; + uint64_t y2116 = uu____22 & (uint64_t)0x4444444444444444U; + uint64_t y3216 = uu____22 & (uint64_t)0x8888888888888888U; + uint64_t z0113 = x016 * y016 ^ (x1116 * y3216 ^ (x2116 * y2116 ^ x3116 * y1116)); + uint64_t z122 = x016 * y1116 ^ (x1116 * y016 ^ (x2116 * y3216 ^ x3116 * y2116)); + uint64_t z221 = x016 * y2116 ^ (x1116 * y1116 ^ (x2116 * y016 ^ x3116 * y3216)); + uint64_t z318 = x016 * y3216 ^ (x1116 * y2116 ^ (x2116 * y1116 ^ x3116 * y016)); + uint64_t + z1h2 = + (((z0113 & (uint64_t)0x1111111111111111U) | (z122 & (uint64_t)0x2222222222222222U)) + | (z221 & (uint64_t)0x4444444444444444U)) + | (z318 & (uint64_t)0x8888888888888888U); + uint64_t x017 = y3r2 & (uint64_t)0x1111111111111111U; + uint64_t x1117 = y3r2 & (uint64_t)0x2222222222222222U; + uint64_t x2117 = y3r2 & (uint64_t)0x4444444444444444U; + uint64_t x3117 = y3r2 & (uint64_t)0x8888888888888888U; + uint64_t y017 = uu____23 & (uint64_t)0x1111111111111111U; + uint64_t y1117 = uu____23 & (uint64_t)0x2222222222222222U; + uint64_t y2117 = uu____23 & (uint64_t)0x4444444444444444U; + uint64_t y3217 = uu____23 & (uint64_t)0x8888888888888888U; + uint64_t z0114 = x017 * y017 ^ (x1117 * y3217 ^ (x2117 * y2117 ^ x3117 * y1117)); + uint64_t z123 = x017 * y1117 ^ (x1117 * y017 ^ (x2117 * y3217 ^ x3117 * y2117)); + uint64_t z222 = x017 * y2117 ^ (x1117 * y1117 ^ (x2117 * y017 ^ x3117 * y3217)); + uint64_t z319 = x017 * y3217 ^ (x1117 * y2117 ^ (x2117 * y1117 
^ x3117 * y017)); + uint64_t + z2h3 = + (((z0114 & (uint64_t)0x1111111111111111U) | (z123 & (uint64_t)0x2222222222222222U)) + | (z222 & (uint64_t)0x4444444444444444U)) + | (z319 & (uint64_t)0x8888888888888888U); + uint64_t z223 = z2112 ^ (z06 ^ z1110); + uint64_t z2h12 = z2h3 ^ (z0h2 ^ z1h2); + uint64_t + x512 = + (z0h2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x611 = + (x512 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x512 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x711 = + (x611 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x611 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x811 = + (x711 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x711 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x911 = + (x811 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x811 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z0h11 = (x911 << (uint32_t)32U | x911 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x513 = + (z1h2 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h2 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x612 = + (x513 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x513 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x712 = + (x612 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x612 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x812 = + (x712 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x712 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x912 = + (x812 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x812 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h11 = (x912 << (uint32_t)32U | 
x912 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x514 = + (z2h12 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h12 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x613 = + (x514 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x514 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x713 = + (x613 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x613 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x813 = + (x713 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x713 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x913 = + (x813 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x813 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h21 = (x913 << (uint32_t)32U | x913 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z3_1 = z06; + uint64_t z3_2 = z0h11 ^ z223; + uint64_t z3_3 = z1110 ^ z2h21; + uint64_t z3_4 = z1h11; + uint64_t z11 = z1 ^ z3_1; + uint64_t z21 = z2 ^ z3_2; + uint64_t z31 = z3 ^ z3_3; + uint64_t z41 = z4 ^ z3_4; + uint64_t uu____24 = x4[0U]; + uint64_t uu____25 = x4[1U]; + uint64_t uu____26 = y4[0U]; + uint64_t uu____27 = y4[1U]; + uint64_t uu____28 = y4[0U] ^ y4[1U]; + uint64_t uu____29 = yr4[0U]; + uint64_t uu____30 = yr4[1U]; + uint64_t uu____31 = yr4[0U] ^ yr4[1U]; + uint64_t + x515 = + (uu____24 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____24 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x614 = + (x515 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x515 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x714 = + (x614 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x614 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x814 = + (x714 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x714 >> 
(uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x914 = + (x814 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x814 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y1r2 = x914 << (uint32_t)32U | x914 >> (uint32_t)32U; + uint64_t + x516 = + (uu____25 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (uu____25 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x615 = + (x516 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x516 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x715 = + (x615 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x615 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x815 = + (x715 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x715 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x915 = + (x815 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x815 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t y2r2 = x915 << (uint32_t)32U | x915 >> (uint32_t)32U; + uint64_t y31 = uu____24 ^ uu____25; + uint64_t y3r = y1r2 ^ y2r2; + uint64_t x018 = uu____24 & (uint64_t)0x1111111111111111U; + uint64_t x1118 = uu____24 & (uint64_t)0x2222222222222222U; + uint64_t x2118 = uu____24 & (uint64_t)0x4444444444444444U; + uint64_t x3118 = uu____24 & (uint64_t)0x8888888888888888U; + uint64_t y018 = uu____26 & (uint64_t)0x1111111111111111U; + uint64_t y1118 = uu____26 & (uint64_t)0x2222222222222222U; + uint64_t y2118 = uu____26 & (uint64_t)0x4444444444444444U; + uint64_t y3218 = uu____26 & (uint64_t)0x8888888888888888U; + uint64_t z0 = x018 * y018 ^ (x1118 * y3218 ^ (x2118 * y2118 ^ x3118 * y1118)); + uint64_t z124 = x018 * y1118 ^ (x1118 * y018 ^ (x2118 * y3218 ^ x3118 * y2118)); + uint64_t z224 = x018 * y2118 ^ (x1118 * y1118 ^ (x2118 * y018 ^ x3118 * y3218)); + uint64_t z320 = x018 * y3218 ^ (x1118 * y2118 ^ (x2118 * 
y1118 ^ x3118 * y018)); + uint64_t + z07 = + (((z0 & (uint64_t)0x1111111111111111U) | (z124 & (uint64_t)0x2222222222222222U)) + | (z224 & (uint64_t)0x4444444444444444U)) + | (z320 & (uint64_t)0x8888888888888888U); + uint64_t x019 = uu____25 & (uint64_t)0x1111111111111111U; + uint64_t x1119 = uu____25 & (uint64_t)0x2222222222222222U; + uint64_t x2119 = uu____25 & (uint64_t)0x4444444444444444U; + uint64_t x3119 = uu____25 & (uint64_t)0x8888888888888888U; + uint64_t y019 = uu____27 & (uint64_t)0x1111111111111111U; + uint64_t y1119 = uu____27 & (uint64_t)0x2222222222222222U; + uint64_t y2119 = uu____27 & (uint64_t)0x4444444444444444U; + uint64_t y3219 = uu____27 & (uint64_t)0x8888888888888888U; + uint64_t z0115 = x019 * y019 ^ (x1119 * y3219 ^ (x2119 * y2119 ^ x3119 * y1119)); + uint64_t z125 = x019 * y1119 ^ (x1119 * y019 ^ (x2119 * y3219 ^ x3119 * y2119)); + uint64_t z225 = x019 * y2119 ^ (x1119 * y1119 ^ (x2119 * y019 ^ x3119 * y3219)); + uint64_t z321 = x019 * y3219 ^ (x1119 * y2119 ^ (x2119 * y1119 ^ x3119 * y019)); + uint64_t + z126 = + (((z0115 & (uint64_t)0x1111111111111111U) | (z125 & (uint64_t)0x2222222222222222U)) + | (z225 & (uint64_t)0x4444444444444444U)) + | (z321 & (uint64_t)0x8888888888888888U); + uint64_t x020 = y31 & (uint64_t)0x1111111111111111U; + uint64_t x1120 = y31 & (uint64_t)0x2222222222222222U; + uint64_t x2120 = y31 & (uint64_t)0x4444444444444444U; + uint64_t x3120 = y31 & (uint64_t)0x8888888888888888U; + uint64_t y020 = uu____28 & (uint64_t)0x1111111111111111U; + uint64_t y1120 = uu____28 & (uint64_t)0x2222222222222222U; + uint64_t y2120 = uu____28 & (uint64_t)0x4444444444444444U; + uint64_t y3220 = uu____28 & (uint64_t)0x8888888888888888U; + uint64_t z0116 = x020 * y020 ^ (x1120 * y3220 ^ (x2120 * y2120 ^ x3120 * y1120)); + uint64_t z130 = x020 * y1120 ^ (x1120 * y020 ^ (x2120 * y3220 ^ x3120 * y2120)); + uint64_t z226 = x020 * y2120 ^ (x1120 * y1120 ^ (x2120 * y020 ^ x3120 * y3220)); + uint64_t z322 = x020 * y3220 ^ (x1120 * y2120 ^ (x2120 
* y1120 ^ x3120 * y020)); + uint64_t + z227 = + (((z0116 & (uint64_t)0x1111111111111111U) | (z130 & (uint64_t)0x2222222222222222U)) + | (z226 & (uint64_t)0x4444444444444444U)) + | (z322 & (uint64_t)0x8888888888888888U); + uint64_t x021 = y1r2 & (uint64_t)0x1111111111111111U; + uint64_t x1121 = y1r2 & (uint64_t)0x2222222222222222U; + uint64_t x2121 = y1r2 & (uint64_t)0x4444444444444444U; + uint64_t x3121 = y1r2 & (uint64_t)0x8888888888888888U; + uint64_t y021 = uu____29 & (uint64_t)0x1111111111111111U; + uint64_t y1121 = uu____29 & (uint64_t)0x2222222222222222U; + uint64_t y2121 = uu____29 & (uint64_t)0x4444444444444444U; + uint64_t y3221 = uu____29 & (uint64_t)0x8888888888888888U; + uint64_t z0117 = x021 * y021 ^ (x1121 * y3221 ^ (x2121 * y2121 ^ x3121 * y1121)); + uint64_t z131 = x021 * y1121 ^ (x1121 * y021 ^ (x2121 * y3221 ^ x3121 * y2121)); + uint64_t z230 = x021 * y2121 ^ (x1121 * y1121 ^ (x2121 * y021 ^ x3121 * y3221)); + uint64_t z323 = x021 * y3221 ^ (x1121 * y2121 ^ (x2121 * y1121 ^ x3121 * y021)); + uint64_t + z0h3 = + (((z0117 & (uint64_t)0x1111111111111111U) | (z131 & (uint64_t)0x2222222222222222U)) + | (z230 & (uint64_t)0x4444444444444444U)) + | (z323 & (uint64_t)0x8888888888888888U); + uint64_t x022 = y2r2 & (uint64_t)0x1111111111111111U; + uint64_t x1122 = y2r2 & (uint64_t)0x2222222222222222U; + uint64_t x2122 = y2r2 & (uint64_t)0x4444444444444444U; + uint64_t x3122 = y2r2 & (uint64_t)0x8888888888888888U; + uint64_t y022 = uu____30 & (uint64_t)0x1111111111111111U; + uint64_t y1122 = uu____30 & (uint64_t)0x2222222222222222U; + uint64_t y2122 = uu____30 & (uint64_t)0x4444444444444444U; + uint64_t y3222 = uu____30 & (uint64_t)0x8888888888888888U; + uint64_t z0118 = x022 * y022 ^ (x1122 * y3222 ^ (x2122 * y2122 ^ x3122 * y1122)); + uint64_t z132 = x022 * y1122 ^ (x1122 * y022 ^ (x2122 * y3222 ^ x3122 * y2122)); + uint64_t z231 = x022 * y2122 ^ (x1122 * y1122 ^ (x2122 * y022 ^ x3122 * y3222)); + uint64_t z324 = x022 * y3222 ^ (x1122 * y2122 ^ (x2122 * 
y1122 ^ x3122 * y022)); + uint64_t + z1h3 = + (((z0118 & (uint64_t)0x1111111111111111U) | (z132 & (uint64_t)0x2222222222222222U)) + | (z231 & (uint64_t)0x4444444444444444U)) + | (z324 & (uint64_t)0x8888888888888888U); + uint64_t x0 = y3r & (uint64_t)0x1111111111111111U; + uint64_t x11 = y3r & (uint64_t)0x2222222222222222U; + uint64_t x21 = y3r & (uint64_t)0x4444444444444444U; + uint64_t x31 = y3r & (uint64_t)0x8888888888888888U; + uint64_t y0 = uu____31 & (uint64_t)0x1111111111111111U; + uint64_t y11 = uu____31 & (uint64_t)0x2222222222222222U; + uint64_t y21 = uu____31 & (uint64_t)0x4444444444444444U; + uint64_t y32 = uu____31 & (uint64_t)0x8888888888888888U; + uint64_t z01 = x0 * y0 ^ (x11 * y32 ^ (x21 * y21 ^ x31 * y11)); + uint64_t z13 = x0 * y11 ^ (x11 * y0 ^ (x21 * y32 ^ x31 * y21)); + uint64_t z232 = x0 * y21 ^ (x11 * y11 ^ (x21 * y0 ^ x31 * y32)); + uint64_t z325 = x0 * y32 ^ (x11 * y21 ^ (x21 * y11 ^ x31 * y0)); + uint64_t + z2h4 = + (((z01 & (uint64_t)0x1111111111111111U) | (z13 & (uint64_t)0x2222222222222222U)) + | (z232 & (uint64_t)0x4444444444444444U)) + | (z325 & (uint64_t)0x8888888888888888U); + uint64_t z23 = z227 ^ (z07 ^ z126); + uint64_t z2h1 = z2h4 ^ (z0h3 ^ z1h3); + uint64_t + x517 = + (z0h3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z0h3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x616 = + (x517 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x517 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x716 = + (x616 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x616 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x816 = + (x716 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x716 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x916 = + (x816 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x816 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + 
uint64_t z0h12 = (x916 << (uint32_t)32U | x916 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x518 = + (z1h3 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z1h3 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x617 = + (x518 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x518 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x717 = + (x617 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x617 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x817 = + (x717 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x717 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x917 = + (x817 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x817 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z1h12 = (x917 << (uint32_t)32U | x917 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t + x5 = + (z2h1 & (uint64_t)0x5555555555555555U) + << (uint32_t)(uint8_t)1U + | (z2h1 >> (uint32_t)(uint8_t)1U & (uint64_t)0x5555555555555555U); + uint64_t + x618 = + (x5 & (uint64_t)0x3333333333333333U) + << (uint32_t)(uint8_t)2U + | (x5 >> (uint32_t)(uint8_t)2U & (uint64_t)0x3333333333333333U); + uint64_t + x718 = + (x618 & (uint64_t)0x0F0F0F0F0F0F0F0FU) + << (uint32_t)(uint8_t)4U + | (x618 >> (uint32_t)(uint8_t)4U & (uint64_t)0x0F0F0F0F0F0F0F0FU); + uint64_t + x818 = + (x718 & (uint64_t)0x00FF00FF00FF00FFU) + << (uint32_t)(uint8_t)8U + | (x718 >> (uint32_t)(uint8_t)8U & (uint64_t)0x00FF00FF00FF00FFU); + uint64_t + x918 = + (x818 & (uint64_t)0x0000FFFF0000FFFFU) + << (uint32_t)(uint8_t)16U + | (x818 >> (uint32_t)(uint8_t)16U & (uint64_t)0x0000FFFF0000FFFFU); + uint64_t z2h22 = (x918 << (uint32_t)32U | x918 >> (uint32_t)32U) >> (uint32_t)1U; + uint64_t z4_1 = z07; + uint64_t z4_2 = z0h12 ^ z23; + uint64_t z4_3 = z126 ^ z2h22; + uint64_t z4_4 = z1h12; + uint64_t z12 = z11 ^ z4_1; + uint64_t z22 = z21 ^ z4_2; + uint64_t 
z32 = z31 ^ z4_3; + uint64_t z42 = z41 ^ z4_4; + uint64_t v3 = z42 << (uint32_t)1U | z32 >> (uint32_t)63U; + uint64_t v20 = z32 << (uint32_t)1U | z22 >> (uint32_t)63U; + uint64_t v1 = z22 << (uint32_t)1U | z12 >> (uint32_t)63U; + uint64_t v0 = z12 << (uint32_t)1U; + uint64_t v21 = v20 ^ (v0 ^ (v0 >> (uint32_t)1U ^ (v0 >> (uint32_t)2U ^ v0 >> (uint32_t)7U))); + uint64_t v11 = v1 ^ (v0 << (uint32_t)63U ^ (v0 << (uint32_t)62U ^ v0 << (uint32_t)57U)); + uint64_t + v31 = v3 ^ (v11 ^ (v11 >> (uint32_t)1U ^ (v11 >> (uint32_t)2U ^ v11 >> (uint32_t)7U))); + uint64_t v22 = v21 ^ (v11 << (uint32_t)63U ^ (v11 << (uint32_t)62U ^ v11 << (uint32_t)57U)); + uint64_t v10 = v22; + uint64_t v2 = v31; + acc[0U] = v10; + acc[1U] = v2; +} + +void Hacl_Gf128_CT64_gcm_init(uint64_t *ctx, uint8_t *key) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + load_precompute_r(pre, key); +} + +void Hacl_Gf128_CT64_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) +{ + uint64_t *acc = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f[8U] = { 0U }; + uint64_t *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * (uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y0 = tb; + uint64_t *x1 = b4 + (uint32_t)2U; + uint8_t *y1 = tb + (uint32_t)16U; + uint64_t *x2 = b4 + (uint32_t)4U; + uint8_t *y2 = tb + (uint32_t)32U; + uint64_t *x3 = b4 + (uint32_t)6U; + uint8_t *y3 = tb + (uint32_t)48U; + uint64_t u = load64_be(y0); + x0[1U] = u; + uint64_t u0 = load64_be(y0 + (uint32_t)8U); + x0[0U] = u0; + uint64_t u1 = load64_be(y1); + x1[1U] = u1; + uint64_t u2 = load64_be(y1 + (uint32_t)8U); + x1[0U] = u2; + uint64_t u3 = load64_be(y2); + x2[1U] = u3; + uint64_t u4 = load64_be(y2 + (uint32_t)8U); + x2[0U] = u4; + uint64_t u5 = load64_be(y3); + x3[1U] = u5; + 
uint64_t u6 = load64_be(y3 + (uint32_t)8U); + x3[0U] = u6; + uint64_t *uu____0 = b4; + uu____0[0U] = uu____0[0U] ^ acc[0U]; + uu____0[1U] = uu____0[1U] ^ acc[1U]; + normalize4(acc, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r1 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + fmul0(acc, r1); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc[0U] = acc[0U] ^ elem[0U]; + acc[1U] = acc[1U] ^ elem[1U]; + fmul0(acc, r1); + return; + } +} + +void +(*Hacl_Gf128_CT64_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = + Hacl_Gf128_CT64_gcm_update_blocks; + +void Hacl_Gf128_CT64_gcm_emit(uint8_t *tag, uint64_t *ctx) +{ + uint64_t *acc = ctx; + uint64_t r0 = acc[1U]; + uint64_t r1 = acc[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + +void Hacl_Gf128_CT64_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) +{ + uint64_t ctx[18U] = { 0U }; + uint64_t *acc = ctx; + uint64_t *pre0 = ctx + (uint32_t)2U; + acc[0U] = (uint64_t)0U; + acc[1U] = (uint64_t)0U; + load_precompute_r(pre0, key); + uint64_t *acc0 = ctx; + uint64_t *pre = ctx + (uint32_t)2U; + uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; + uint8_t *t0 = text; + if (len0 > (uint32_t)0U) + { + uint64_t f[8U] = { 0U }; + uint64_t *b4 = f; + uint32_t nb = len0 / (uint32_t)64U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t0 + i * 
(uint32_t)64U; + uint64_t *x0 = b4; + uint8_t *y0 = tb; + uint64_t *x1 = b4 + (uint32_t)2U; + uint8_t *y1 = tb + (uint32_t)16U; + uint64_t *x2 = b4 + (uint32_t)4U; + uint8_t *y2 = tb + (uint32_t)32U; + uint64_t *x3 = b4 + (uint32_t)6U; + uint8_t *y3 = tb + (uint32_t)48U; + uint64_t u = load64_be(y0); + x0[1U] = u; + uint64_t u0 = load64_be(y0 + (uint32_t)8U); + x0[0U] = u0; + uint64_t u1 = load64_be(y1); + x1[1U] = u1; + uint64_t u2 = load64_be(y1 + (uint32_t)8U); + x1[0U] = u2; + uint64_t u3 = load64_be(y2); + x2[1U] = u3; + uint64_t u4 = load64_be(y2 + (uint32_t)8U); + x2[0U] = u4; + uint64_t u5 = load64_be(y3); + x3[1U] = u5; + uint64_t u6 = load64_be(y3 + (uint32_t)8U); + x3[0U] = u6; + uint64_t *uu____0 = b4; + uu____0[0U] = uu____0[0U] ^ acc0[0U]; + uu____0[1U] = uu____0[1U] ^ acc0[1U]; + normalize4(acc0, b4, pre); + } + } + uint32_t len1 = len - len0; + uint8_t *t1 = text + len0; + uint64_t *r10 = pre + (uint32_t)6U; + uint32_t nb = len1 / (uint32_t)16U; + uint32_t rem = len1 % (uint32_t)16U; + for (uint32_t i = (uint32_t)0U; i < nb; i++) + { + uint8_t *tb = t1 + i * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint64_t u = load64_be(tb); + elem[1U] = u; + uint64_t u0 = load64_be(tb + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + fmul0(acc0, r10); + } + if (rem > (uint32_t)0U) + { + uint8_t *last = t1 + nb * (uint32_t)16U; + uint64_t elem[2U] = { 0U }; + uint8_t b[16U] = { 0U }; + memcpy(b, last, rem * sizeof (uint8_t)); + uint64_t u = load64_be(b); + elem[1U] = u; + uint64_t u0 = load64_be(b + (uint32_t)8U); + elem[0U] = u0; + acc0[0U] = acc0[0U] ^ elem[0U]; + acc0[1U] = acc0[1U] ^ elem[1U]; + fmul0(acc0, r10); + } + uint64_t *acc1 = ctx; + uint64_t r0 = acc1[1U]; + uint64_t r1 = acc1[0U]; + store64_be(tag, r0); + store64_be(tag + (uint32_t)8U, r1); +} + diff --git a/src/msvc/Hacl_Gf128_PreComp.c b/src/msvc/Hacl_Gf128_PreComp.c deleted file mode 100644 index fa12b870..00000000 --- 
a/src/msvc/Hacl_Gf128_PreComp.c +++ /dev/null @@ -1,461 +0,0 @@ -/* MIT License - * - * Copyright (c) 2016-2022 INRIA, CMU and Microsoft Corporation - * Copyright (c) 2022-2023 HACL* Contributors - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell - * copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ - - -#include "Hacl_Gf128_PreComp.h" - -void Hacl_Impl_Gf128_FieldPreComp_fmul(uint64_t *x, uint64_t *y) -{ - uint64_t res[2U] = { 0U }; - uint64_t y_[2U] = { 0U }; - y_[0U] = y[0U]; - y_[1U] = y[1U]; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); - res[0U] = res[0U] ^ (y_[0U] & m); - res[1U] = res[1U] ^ (y_[1U] & m); - uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); - y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; - y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); - res[0U] = res[0U] ^ (y_[0U] & m); - res[1U] = res[1U] ^ (y_[1U] & m); - uint64_t m0 = (uint64_t)0U - (y_[0U] & (uint64_t)1U); - y_[0U] = y_[0U] >> (uint32_t)1U | y_[1U] << (uint32_t)63U; - y_[1U] = y_[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m0); - } - x[0U] = res[0U]; - x[1U] = res[1U]; -} - -static inline void prepare(uint64_t *pre, uint64_t *r) -{ - memset(pre, 0U, (uint32_t)256U * sizeof (uint64_t)); - uint64_t sh[2U] = { 0U }; - sh[0U] = r[0U]; - sh[1U] = r[1U]; - uint64_t *pre1 = pre; - uint64_t *pre2 = pre + (uint32_t)128U; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - memcpy(pre1 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); - uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); - sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; - sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - memcpy(pre2 + (uint32_t)2U * i, sh, (uint32_t)2U * sizeof (uint64_t)); - uint64_t m = (uint64_t)0U - (sh[0U] & (uint64_t)1U); - sh[0U] = sh[0U] >> (uint32_t)1U | sh[1U] << (uint32_t)63U; - sh[1U] = sh[1U] >> (uint32_t)1U ^ ((uint64_t)0xE100000000000000U & m); - } -} - -void 
Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(uint64_t *pre, uint8_t *key) -{ - uint64_t *r4321 = pre; - uint64_t *r1 = r4321 + (uint32_t)6U; - uint64_t *r2 = r4321 + (uint32_t)4U; - uint64_t *r3 = r4321 + (uint32_t)2U; - uint64_t *r4 = r4321; - uint64_t *table2 = pre + (uint32_t)8U; - uint64_t u = load64_be(key); - r1[1U] = u; - uint64_t u0 = load64_be(key + (uint32_t)8U); - r1[0U] = u0; - r4[0U] = r1[0U]; - r4[1U] = r1[1U]; - r3[0U] = r1[0U]; - r3[1U] = r1[1U]; - r2[0U] = r1[0U]; - r2[1U] = r1[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(r2, r1); - Hacl_Impl_Gf128_FieldPreComp_fmul(r3, r2); - Hacl_Impl_Gf128_FieldPreComp_fmul(r4, r3); - prepare(table2, r4); -} - -static inline void fmul_pre(uint64_t *x, uint64_t *pre) -{ - uint64_t *tab = pre + (uint32_t)8U; - uint64_t tmp[2U] = { 0U }; - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t *uu____0 = tab + (uint32_t)2U * i; - uint64_t m = (uint64_t)0U - (x[1U] >> ((uint32_t)63U - i) & (uint64_t)1U); - tmp[0U] = tmp[0U] ^ (uu____0[0U] & m); - tmp[1U] = tmp[1U] ^ (uu____0[1U] & m); - } - for (uint32_t i = (uint32_t)0U; i < (uint32_t)64U; i++) - { - uint64_t *uu____1 = tab + (uint32_t)128U + (uint32_t)2U * i; - uint64_t m = (uint64_t)0U - (x[0U] >> ((uint32_t)63U - i) & (uint64_t)1U); - tmp[0U] = tmp[0U] ^ (uu____1[0U] & m); - tmp[1U] = tmp[1U] ^ (uu____1[1U] & m); - } - x[0U] = tmp[0U]; - x[1U] = tmp[1U]; -} - -void Hacl_Impl_Gf128_FieldPreComp_fmul_r4(uint64_t *x, uint64_t *pre) -{ - fmul_pre(x, pre); - fmul_pre(x + (uint32_t)2U, pre); - fmul_pre(x + (uint32_t)4U, pre); - fmul_pre(x + (uint32_t)6U, pre); -} - -void Hacl_Impl_Gf128_FieldPreComp_normalize4(uint64_t *acc, uint64_t *x, uint64_t *pre) -{ - uint64_t *x1 = x; - uint64_t *x2 = x + (uint32_t)2U; - uint64_t *x3 = x + (uint32_t)4U; - uint64_t *x4 = x + (uint32_t)6U; - fmul_pre(x, pre); - Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)2U, pre + (uint32_t)2U); - Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)4U, pre + (uint32_t)4U); - 
Hacl_Impl_Gf128_FieldPreComp_fmul(x + (uint32_t)6U, pre + (uint32_t)6U); - acc[0U] = x1[0U]; - acc[1U] = x1[1U]; - acc[0U] = acc[0U] ^ x2[0U]; - acc[1U] = acc[1U] ^ x2[1U]; - acc[0U] = acc[0U] ^ x3[0U]; - acc[1U] = acc[1U] ^ x3[1U]; - acc[0U] = acc[0U] ^ x4[0U]; - acc[1U] = acc[1U] ^ x4[1U]; -} - -void Hacl_Gf128_PreComp_gcm_init(uint64_t *ctx, uint8_t *key) -{ - uint64_t *acc = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - acc[0U] = (uint64_t)0U; - acc[1U] = (uint64_t)0U; - Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre, key); -} - -void Hacl_Gf128_PreComp_gcm_update_blocks(uint64_t *ctx, uint32_t len, uint8_t *text) -{ - uint64_t *acc = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; - uint8_t *t0 = text; - if (len0 > (uint32_t)0U) - { - uint64_t f0[8U] = { 0U }; - uint64_t *b4 = f0; - uint64_t f[8U] = { 0U }; - uint64_t *acc4 = f; - uint8_t *tb = t0; - memcpy(acc4, acc, (uint32_t)2U * sizeof (uint64_t)); - uint64_t *x00 = b4; - uint8_t *y00 = tb; - uint64_t *x10 = b4 + (uint32_t)2U; - uint8_t *y10 = tb + (uint32_t)16U; - uint64_t *x20 = b4 + (uint32_t)4U; - uint8_t *y20 = tb + (uint32_t)32U; - uint64_t *x30 = b4 + (uint32_t)6U; - uint8_t *y30 = tb + (uint32_t)48U; - uint64_t u0 = load64_be(y00); - x00[1U] = u0; - uint64_t u1 = load64_be(y00 + (uint32_t)8U); - x00[0U] = u1; - uint64_t u2 = load64_be(y10); - x10[1U] = u2; - uint64_t u3 = load64_be(y10 + (uint32_t)8U); - x10[0U] = u3; - uint64_t u4 = load64_be(y20); - x20[1U] = u4; - uint64_t u5 = load64_be(y20 + (uint32_t)8U); - x20[0U] = u5; - uint64_t u6 = load64_be(y30); - x30[1U] = u6; - uint64_t u7 = load64_be(y30 + (uint32_t)8U); - x30[0U] = u7; - uint64_t *x01 = acc4; - uint64_t *y01 = b4; - uint64_t *x11 = acc4 + (uint32_t)2U; - uint64_t *y11 = b4 + (uint32_t)2U; - uint64_t *x21 = acc4 + (uint32_t)4U; - uint64_t *y21 = b4 + (uint32_t)4U; - uint64_t *x31 = acc4 + (uint32_t)6U; - uint64_t *y31 = b4 + (uint32_t)6U; - x01[0U] = x01[0U] ^ y01[0U]; - x01[1U] = 
x01[1U] ^ y01[1U]; - x11[0U] = x11[0U] ^ y11[0U]; - x11[1U] = x11[1U] ^ y11[1U]; - x21[0U] = x21[0U] ^ y21[0U]; - x21[1U] = x21[1U] ^ y21[1U]; - x31[0U] = x31[0U] ^ y31[0U]; - x31[1U] = x31[1U] ^ y31[1U]; - uint32_t len1 = len0 - (uint32_t)64U; - uint8_t *text1 = t0 + (uint32_t)64U; - uint32_t nb = len1 / (uint32_t)64U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb1 = text1 + i * (uint32_t)64U; - uint64_t *x0 = b4; - uint8_t *y02 = tb1; - uint64_t *x12 = b4 + (uint32_t)2U; - uint8_t *y12 = tb1 + (uint32_t)16U; - uint64_t *x22 = b4 + (uint32_t)4U; - uint8_t *y22 = tb1 + (uint32_t)32U; - uint64_t *x32 = b4 + (uint32_t)6U; - uint8_t *y32 = tb1 + (uint32_t)48U; - uint64_t u = load64_be(y02); - x0[1U] = u; - uint64_t u8 = load64_be(y02 + (uint32_t)8U); - x0[0U] = u8; - uint64_t u9 = load64_be(y12); - x12[1U] = u9; - uint64_t u10 = load64_be(y12 + (uint32_t)8U); - x12[0U] = u10; - uint64_t u11 = load64_be(y22); - x22[1U] = u11; - uint64_t u12 = load64_be(y22 + (uint32_t)8U); - x22[0U] = u12; - uint64_t u13 = load64_be(y32); - x32[1U] = u13; - uint64_t u14 = load64_be(y32 + (uint32_t)8U); - x32[0U] = u14; - Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); - uint64_t *x02 = acc4; - uint64_t *y0 = b4; - uint64_t *x1 = acc4 + (uint32_t)2U; - uint64_t *y1 = b4 + (uint32_t)2U; - uint64_t *x2 = acc4 + (uint32_t)4U; - uint64_t *y2 = b4 + (uint32_t)4U; - uint64_t *x3 = acc4 + (uint32_t)6U; - uint64_t *y3 = b4 + (uint32_t)6U; - x02[0U] = x02[0U] ^ y0[0U]; - x02[1U] = x02[1U] ^ y0[1U]; - x1[0U] = x1[0U] ^ y1[0U]; - x1[1U] = x1[1U] ^ y1[1U]; - x2[0U] = x2[0U] ^ y2[0U]; - x2[1U] = x2[1U] ^ y2[1U]; - x3[0U] = x3[0U] ^ y3[0U]; - x3[1U] = x3[1U] ^ y3[1U]; - } - Hacl_Impl_Gf128_FieldPreComp_normalize4(acc, acc4, pre); - } - uint32_t len1 = len - len0; - uint8_t *t1 = text + len0; - uint64_t *r1 = pre + (uint32_t)6U; - uint32_t nb = len1 / (uint32_t)16U; - uint32_t rem = len1 % (uint32_t)16U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb = t1 + i * 
(uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint64_t u = load64_be(tb); - elem[1U] = u; - uint64_t u0 = load64_be(tb + (uint32_t)8U); - elem[0U] = u0; - acc[0U] = acc[0U] ^ elem[0U]; - acc[1U] = acc[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); - } - if (rem > (uint32_t)0U) - { - uint8_t *last = t1 + nb * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint8_t b[16U] = { 0U }; - memcpy(b, last, rem * sizeof (uint8_t)); - uint64_t u = load64_be(b); - elem[1U] = u; - uint64_t u0 = load64_be(b + (uint32_t)8U); - elem[0U] = u0; - acc[0U] = acc[0U] ^ elem[0U]; - acc[1U] = acc[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc, r1); - return; - } -} - -void -(*Hacl_Gf128_PreComp_gcm_update_blocks_padded)(uint64_t *x0, uint32_t x1, uint8_t *x2) = - Hacl_Gf128_PreComp_gcm_update_blocks; - -void Hacl_Gf128_PreComp_gcm_emit(uint8_t *tag, uint64_t *ctx) -{ - uint64_t *acc = ctx; - uint64_t r0 = acc[1U]; - uint64_t r1 = acc[0U]; - store64_be(tag, r0); - store64_be(tag + (uint32_t)8U, r1); -} - -void Hacl_Gf128_PreComp_ghash(uint8_t *tag, uint32_t len, uint8_t *text, uint8_t *key) -{ - uint64_t ctx[266U] = { 0U }; - uint64_t *acc = ctx; - uint64_t *pre0 = ctx + (uint32_t)2U; - acc[0U] = (uint64_t)0U; - acc[1U] = (uint64_t)0U; - Hacl_Impl_Gf128_FieldPreComp_load_precompute_r(pre0, key); - uint64_t *acc0 = ctx; - uint64_t *pre = ctx + (uint32_t)2U; - uint32_t len0 = len / (uint32_t)64U * (uint32_t)64U; - uint8_t *t0 = text; - if (len0 > (uint32_t)0U) - { - uint64_t f0[8U] = { 0U }; - uint64_t *b4 = f0; - uint64_t f[8U] = { 0U }; - uint64_t *acc4 = f; - uint8_t *tb = t0; - memcpy(acc4, acc0, (uint32_t)2U * sizeof (uint64_t)); - uint64_t *x00 = b4; - uint8_t *y00 = tb; - uint64_t *x10 = b4 + (uint32_t)2U; - uint8_t *y10 = tb + (uint32_t)16U; - uint64_t *x20 = b4 + (uint32_t)4U; - uint8_t *y20 = tb + (uint32_t)32U; - uint64_t *x30 = b4 + (uint32_t)6U; - uint8_t *y30 = tb + (uint32_t)48U; - uint64_t u0 = load64_be(y00); - x00[1U] = u0; - uint64_t u1 = 
load64_be(y00 + (uint32_t)8U); - x00[0U] = u1; - uint64_t u2 = load64_be(y10); - x10[1U] = u2; - uint64_t u3 = load64_be(y10 + (uint32_t)8U); - x10[0U] = u3; - uint64_t u4 = load64_be(y20); - x20[1U] = u4; - uint64_t u5 = load64_be(y20 + (uint32_t)8U); - x20[0U] = u5; - uint64_t u6 = load64_be(y30); - x30[1U] = u6; - uint64_t u7 = load64_be(y30 + (uint32_t)8U); - x30[0U] = u7; - uint64_t *x01 = acc4; - uint64_t *y01 = b4; - uint64_t *x11 = acc4 + (uint32_t)2U; - uint64_t *y11 = b4 + (uint32_t)2U; - uint64_t *x21 = acc4 + (uint32_t)4U; - uint64_t *y21 = b4 + (uint32_t)4U; - uint64_t *x31 = acc4 + (uint32_t)6U; - uint64_t *y31 = b4 + (uint32_t)6U; - x01[0U] = x01[0U] ^ y01[0U]; - x01[1U] = x01[1U] ^ y01[1U]; - x11[0U] = x11[0U] ^ y11[0U]; - x11[1U] = x11[1U] ^ y11[1U]; - x21[0U] = x21[0U] ^ y21[0U]; - x21[1U] = x21[1U] ^ y21[1U]; - x31[0U] = x31[0U] ^ y31[0U]; - x31[1U] = x31[1U] ^ y31[1U]; - uint32_t len1 = len0 - (uint32_t)64U; - uint8_t *text1 = t0 + (uint32_t)64U; - uint32_t nb = len1 / (uint32_t)64U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb1 = text1 + i * (uint32_t)64U; - uint64_t *x0 = b4; - uint8_t *y02 = tb1; - uint64_t *x12 = b4 + (uint32_t)2U; - uint8_t *y12 = tb1 + (uint32_t)16U; - uint64_t *x22 = b4 + (uint32_t)4U; - uint8_t *y22 = tb1 + (uint32_t)32U; - uint64_t *x32 = b4 + (uint32_t)6U; - uint8_t *y32 = tb1 + (uint32_t)48U; - uint64_t u = load64_be(y02); - x0[1U] = u; - uint64_t u8 = load64_be(y02 + (uint32_t)8U); - x0[0U] = u8; - uint64_t u9 = load64_be(y12); - x12[1U] = u9; - uint64_t u10 = load64_be(y12 + (uint32_t)8U); - x12[0U] = u10; - uint64_t u11 = load64_be(y22); - x22[1U] = u11; - uint64_t u12 = load64_be(y22 + (uint32_t)8U); - x22[0U] = u12; - uint64_t u13 = load64_be(y32); - x32[1U] = u13; - uint64_t u14 = load64_be(y32 + (uint32_t)8U); - x32[0U] = u14; - Hacl_Impl_Gf128_FieldPreComp_fmul_r4(acc4, pre); - uint64_t *x02 = acc4; - uint64_t *y0 = b4; - uint64_t *x1 = acc4 + (uint32_t)2U; - uint64_t *y1 = b4 + 
(uint32_t)2U; - uint64_t *x2 = acc4 + (uint32_t)4U; - uint64_t *y2 = b4 + (uint32_t)4U; - uint64_t *x3 = acc4 + (uint32_t)6U; - uint64_t *y3 = b4 + (uint32_t)6U; - x02[0U] = x02[0U] ^ y0[0U]; - x02[1U] = x02[1U] ^ y0[1U]; - x1[0U] = x1[0U] ^ y1[0U]; - x1[1U] = x1[1U] ^ y1[1U]; - x2[0U] = x2[0U] ^ y2[0U]; - x2[1U] = x2[1U] ^ y2[1U]; - x3[0U] = x3[0U] ^ y3[0U]; - x3[1U] = x3[1U] ^ y3[1U]; - } - Hacl_Impl_Gf128_FieldPreComp_normalize4(acc0, acc4, pre); - } - uint32_t len1 = len - len0; - uint8_t *t1 = text + len0; - uint64_t *r10 = pre + (uint32_t)6U; - uint32_t nb = len1 / (uint32_t)16U; - uint32_t rem = len1 % (uint32_t)16U; - for (uint32_t i = (uint32_t)0U; i < nb; i++) - { - uint8_t *tb = t1 + i * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint64_t u = load64_be(tb); - elem[1U] = u; - uint64_t u0 = load64_be(tb + (uint32_t)8U); - elem[0U] = u0; - acc0[0U] = acc0[0U] ^ elem[0U]; - acc0[1U] = acc0[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); - } - if (rem > (uint32_t)0U) - { - uint8_t *last = t1 + nb * (uint32_t)16U; - uint64_t elem[2U] = { 0U }; - uint8_t b[16U] = { 0U }; - memcpy(b, last, rem * sizeof (uint8_t)); - uint64_t u = load64_be(b); - elem[1U] = u; - uint64_t u0 = load64_be(b + (uint32_t)8U); - elem[0U] = u0; - acc0[0U] = acc0[0U] ^ elem[0U]; - acc0[1U] = acc0[1U] ^ elem[1U]; - Hacl_Impl_Gf128_FieldPreComp_fmul(acc0, r10); - } - uint64_t *acc1 = ctx; - uint64_t r0 = acc1[1U]; - uint64_t r1 = acc1[0U]; - store64_be(tag, r0); - store64_be(tag + (uint32_t)8U, r1); -} - diff --git a/tests/aead.cc b/tests/aead.cc index 38dcf19f..8b103f35 100644 --- a/tests/aead.cc +++ b/tests/aead.cc @@ -117,22 +117,8 @@ TEST_P(AesGcmSuite, KAT) } else { FAIL() << "Unexpected keySize."; } - - if (res != EverCrypt_Error_Success) { - if (!EverCrypt_AutoConfig2_has_aesni() || - !EverCrypt_AutoConfig2_has_pclmulqdq() || - !EverCrypt_AutoConfig2_has_avx() || !EverCrypt_AutoConfig2_has_sse() || - !EverCrypt_AutoConfig2_has_movbe()) { - cout << "Skipping failed 
`EverCrypt_AEAD_create_in(...)` due to missing " - "features." - << endl; - return; - } else { - FAIL() << "`EverCrypt_AEAD_create_in(...)` failed unexpectedly with " - "error code \"" - << res << "\"."; - } - } + // Should always work. + ASSERT_EQ(res, EverCrypt_Error_Success); encrypt_decrypt( state, test.iv, test.aad, test.msg, test.ct, test.tag, test.valid); From 720a2a779ea36eebe8a5533074cfa5a3c11c146c Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Sat, 5 Aug 2023 20:05:37 +0300 Subject: [PATCH 07/10] Update aesgcm.cc --- benchmarks/aesgcm.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/benchmarks/aesgcm.cc b/benchmarks/aesgcm.cc index 891b5ad8..79397f77 100644 --- a/benchmarks/aesgcm.cc +++ b/benchmarks/aesgcm.cc @@ -30,7 +30,7 @@ HACL_AES_128_GCM_NI_encrypt(benchmark::State& state) bytes ciphertext(state.range(0) + 16, 0); for (auto _ : state) { - Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)288U, sizeof (uint8_t)); Hacl_AES_128_GCM_NI_aes128_gcm_init(ctx, key.data()); Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); @@ -45,7 +45,7 @@ HACL_AES_128_GCM_NI_aad(benchmark::State& state) bytes aad(state.range(0), 0x37); for (auto _ : state) { - Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)352U, sizeof (uint8_t)); + Lib_IntVector_Intrinsics_vec128 *ctx = (Lib_IntVector_Intrinsics_vec128 *)KRML_HOST_CALLOC((uint32_t)288U, sizeof (uint8_t)); Hacl_AES_128_GCM_NI_aes128_gcm_init(ctx, key.data()); Hacl_AES_128_GCM_NI_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); @@ -62,7 +62,7 @@ 
HACL_AES_128_GCM_CT64_encrypt(benchmark::State& state) bytes ciphertext(state.range(0) + 16, 0); for (auto _ : state) { - uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)928U, sizeof (uint8_t)); Hacl_AES_128_GCM_CT64_aes128_gcm_init(ctx, key.data()); Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(ctx, plaintext.size(), ciphertext.data(), plaintext.data(), 0, NULL, nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); @@ -77,7 +77,7 @@ HACL_AES_128_GCM_CT64_aad(benchmark::State& state) bytes aad(state.range(0), 0x37); for (auto _ : state) { - uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)3168U, sizeof (uint8_t)); + uint64_t *ctx = (uint64_t *)KRML_HOST_CALLOC((uint32_t)928U, sizeof (uint8_t)); Hacl_AES_128_GCM_CT64_aes128_gcm_init(ctx, key.data()); Hacl_AES_128_GCM_CT64_aes128_gcm_encrypt(ctx, 0, mac.data(), NULL, aad.size(), aad.data(), nonce.size(), nonce.data()); KRML_HOST_FREE(ctx); From 4b5f3eda6c7236bd33bf819db805547fd80b4487 Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Wed, 9 Aug 2023 14:57:18 +0300 Subject: [PATCH 08/10] Support Windows in CPU feature detection --- cpu-features/src/cpu-features.c | 53 +++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 16 deletions(-) diff --git a/cpu-features/src/cpu-features.c b/cpu-features/src/cpu-features.c index 367494fc..97559c86 100644 --- a/cpu-features/src/cpu-features.c +++ b/cpu-features/src/cpu-features.c @@ -18,7 +18,7 @@ #elif defined(__arm64__) || defined(__arm64) || defined(__aarch64__) #define CPU_FEATURES_ARM64 #elif defined(__s390x__) -#define CPU_FEATURES_POWERZ +#define CPU_FEATURES_S390X #else #error "Unsupported CPU" #endif @@ -27,15 +27,23 @@ #include #include #define CPU_FEATURES_MACOS -#elif defined(__GNUC__) +#elif defined(__linux__) #define CPU_FEATURES_LINUX -#elif defined(_MSC_VER) +#elif defined(_WIN32) #define CPU_FEATURES_WINDOWS #else #error 
"Unsupported OS" #endif -#include +#if defined(CPU_FEATURES_WINDOWS) +#if defined(_MSC_VER) +#include +#define CPU_FEATURES_MSVC +#else +#define CPU_FEATURES_NON_MSVC +#endif +#endif + #if defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ defined(__GLIBC__) && defined(__GLIBC_PREREQ) #if __GLIBC_PREREQ(2, 16) @@ -59,8 +67,8 @@ // === x86 | x64 -#if (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_MACOS)) && \ - defined(CPU_FEATURES_X64) && !defined(CPU_FEATURES_POWERZ) +#if (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_MACOS) || \ + defined(CPU_FEATURES_NON_MSVC)) && defined(CPU_FEATURES_X64) void cpuid(unsigned long leaf, unsigned long* eax, @@ -74,7 +82,8 @@ cpuid(unsigned long leaf, : "0"(leaf)); } -#elif defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_X86) +#elif (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_NON_MSVC)) && \ + defined(CPU_FEATURES_X86) /* XXX: Find a 32-bit CPU to actually test this */ void cpuid(unsigned long leaf, @@ -150,7 +159,7 @@ hacl_vec128_support() return _sse && _sse2 && _sse3 && _sse41 && _sse41 && _cmov; #elif defined(CPU_FEATURES_ARM64) return _asimd; -#elif defined(CPU_FEATURES_POWERZ) +#elif defined(CPU_FEATURES_S390X) return 1; #else return 0; @@ -190,12 +199,27 @@ vale_sha2_support() void hacl_init_cpu_features() { - // TODO: Make this work for Windows. 
-#if (defined(CPU_FEATURES_X64) || defined(CPU_FEATURES_X86)) && \ - (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_MACOS)) +#if (defined(CPU_FEATURES_X64) || defined(CPU_FEATURES_X86)) && \ + (defined(CPU_FEATURES_LINUX) || defined(CPU_FEATURES_MACOS) || \ + defined(CPU_FEATURES_WINDOWS)) unsigned long eax, ebx, ecx, edx, eax_sub, ebx_sub, ecx_sub, edx_sub; +#if defined(CPU_FEATURES_MSVC) + int cpu_info[4]; + int cpu_info_sub[4]; + __cpuidex(cpu_info, 1, 0); + __cpuidex(cpu_info_sub, 7, 0); + eax = cpu_info[0]; + ebx = cpu_info[1]; + ecx = cpu_info[2]; + edx = cpu_info[3]; + eax_sub = cpu_info_sub[0]; + ebx_sub = cpu_info_sub[1]; + ecx_sub = cpu_info_sub[2]; + edx_sub = cpu_info_sub[3]; +#else cpuid(1, &eax, &ebx, &ecx, &edx); cpuid(7, &eax_sub, &ebx_sub, &ecx_sub, &edx_sub); +#endif _aes = (ecx & ECX_AESNI) != 0; _avx = (ecx & ECX_AVX) != 0; @@ -216,23 +240,20 @@ hacl_init_cpu_features() _ssse3 = (ecx & ECX_SSSE3) != 0; _sse41 = (ecx & ECX_SSE4_1) != 0; _sse42 = (ecx & ECX_SSE4_2) != 0; -#endif -#if defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ +#elif defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ defined(GETAUXVAL_FUNC) unsigned long hwcap = getauxval(AT_HWCAP); _asimd = ((hwcap & HWCAP_ASIMD) != 0) ? 1 : 0; _aes = ((hwcap & HWCAP_AES) != 0) ? 1 : 0; _pclmul = ((hwcap & HWCAP_PMULL) != 0) ? 1 : 0; _sha = ((hwcap & HWCAP_SHA2) != 0) ? 1 : 0; -#endif -#if defined(CPU_FEATURES_MACOS) && defined(CPU_FEATURES_ARM64) +#elif defined(CPU_FEATURES_MACOS) && defined(CPU_FEATURES_ARM64) int err; int64_t ret = 0; size_t size = sizeof(ret); - // Check for general support of Advanced SIMD instructions err = sysctlbyname("hw.optional.AdvSIMD", &ret, &size, NULL, 0); _asimd = (err == 0 && ret > 0) ? 
1 : 0; From 4abdd02038ab979d04e62f83821cb29975a0c69e Mon Sep 17 00:00:00 2001 From: mamonet <66893036+mamonet@users.noreply.github.com> Date: Wed, 9 Aug 2023 14:58:36 +0300 Subject: [PATCH 09/10] Update cpu-features.c --- cpu-features/src/cpu-features.c | 1 + 1 file changed, 1 insertion(+) diff --git a/cpu-features/src/cpu-features.c b/cpu-features/src/cpu-features.c index 97559c86..bb899944 100644 --- a/cpu-features/src/cpu-features.c +++ b/cpu-features/src/cpu-features.c @@ -44,6 +44,7 @@ #endif #endif +#include #if defined(CPU_FEATURES_LINUX) && defined(CPU_FEATURES_ARM64) && \ defined(__GLIBC__) && defined(__GLIBC_PREREQ) #if __GLIBC_PREREQ(2, 16) From 943cad61e579afe819c9ab524bd9c8f341999b4b Mon Sep 17 00:00:00 2001 From: Maamoun TK Date: Mon, 14 Aug 2023 10:39:47 +0300 Subject: [PATCH 10/10] Remove not required casting in libintvector.h --- CMakeLists.txt | 2 +- include/libintvector.h | 2 +- include/msvc/libintvector.h | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index f0574b7f..e21d6970 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -442,7 +442,7 @@ install(DIRECTORY vale/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/vale # # Install config.h install(FILES build/config.h DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/hacl) -# CPU feature detection for tests +# CPU feature detection add_library(hacl_cpu_features OBJECT ${PROJECT_SOURCE_DIR}/cpu-features/src/cpu-features.c) target_include_directories(hacl_cpu_features PUBLIC ${PROJECT_SOURCE_DIR}/cpu-features/include) add_dependencies(hacl hacl_cpu_features) diff --git a/include/libintvector.h b/include/libintvector.h index 4c259dba..6faf6ed7 100644 --- a/include/libintvector.h +++ b/include/libintvector.h @@ -555,7 +555,7 @@ static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_ni_aes_ke (vextq_u64(x0,x0,x1)) #define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \ - 
((uint32x4_t){((uint32x4_t)x0)[x1],((uint32x4_t)x0)[x2],((uint32x4_t)x0)[x3],((uint32x4_t)x0)[x4]}) + ((uint32x4_t){(x0)[x1],(x0)[x2],(x0)[x3],(x0)[x4]}) /* #define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \ diff --git a/include/msvc/libintvector.h b/include/msvc/libintvector.h index 4c259dba..6faf6ed7 100644 --- a/include/msvc/libintvector.h +++ b/include/msvc/libintvector.h @@ -555,7 +555,7 @@ static inline Lib_IntVector_Intrinsics_vec128 Lib_IntVector_Intrinsics_ni_aes_ke (vextq_u64(x0,x0,x1)) #define Lib_IntVector_Intrinsics_vec128_shuffle32(x0, x1, x2, x3, x4) \ - ((uint32x4_t){((uint32x4_t)x0)[x1],((uint32x4_t)x0)[x2],((uint32x4_t)x0)[x3],((uint32x4_t)x0)[x4]}) + ((uint32x4_t){(x0)[x1],(x0)[x2],(x0)[x3],(x0)[x4]}) /* #define Lib_IntVector_Intrinsics_vec128_shuffle64(x0, x1, x2) \