From 96fdfb8e62ccb6249b8c3e64b783dde29be13e46 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 16:21:31 +0100 Subject: [PATCH 01/19] Unroll aesce_encrypt_block Signed-off-by: Dave Rodgman --- library/aesce.c | 58 ++++++++++++++++++++++++++++++++++++++++++------- library/aesce.h | 3 +++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 4db8d2a1993e..abd47b1d8543 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -105,22 +105,64 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - for (int i = 0; i < rounds - 1; i++) { - /* AES AddRoundKey, SubBytes, ShiftRows (in this order). - * AddRoundKey adds the round key for the previous round. */ - block = vaeseq_u8(block, vld1q_u8(keys + i * 16)); - /* AES mix columns */ - block = vaesmcq_u8(block); + /* Assume either 10, 12 or 14 rounds */ + if (rounds == 10) { + goto rounds_10; + } + if (rounds == 12) { + goto rounds_12; } + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; +rounds_12: + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; +rounds_10: + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; /* AES AddRoundKey for the previous round. * SubBytes, ShiftRows for the final round. */ - block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16)); + block = vaeseq_u8(block, vld1q_u8(keys)); + keys += 16; /* Final round: no MixColumns */ /* Final AddRoundKey */ - block = veorq_u8(block, vld1q_u8(keys + rounds * 16)); + block = veorq_u8(block, vld1q_u8(keys)); return block; } diff --git a/library/aesce.h b/library/aesce.h index 7048d77c563d..61e73bfddd97 100644 --- a/library/aesce.h +++ b/library/aesce.h @@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void); /** * \brief Internal AES-ECB block encryption and decryption * + * Note: this assumes that the context specifies either 10, 12 or 14 rounds + * and will behave incorrectly if this is not the case. 
+ * * \param ctx AES context * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT * \param input 16-byte input block From 1c4451d089e30be93382d0a3d5faf43db5f893ca Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 16:28:00 +0100 Subject: [PATCH 02/19] Unroll aesce_decrypt_block Signed-off-by: Dave Rodgman --- library/aesce.c | 84 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index abd47b1d8543..e21e3b39da35 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -171,31 +171,77 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - - for (int i = 0; i < rounds - 1; i++) { - /* AES AddRoundKey, SubBytes, ShiftRows */ - block = vaesdq_u8(block, vld1q_u8(keys + i * 16)); - /* AES inverse MixColumns for the next round. - * - * This means that we switch the order of the inverse AddRoundKey and - * inverse MixColumns operations. We have to do this as AddRoundKey is - * done in an atomic instruction together with the inverses of SubBytes - * and ShiftRows. - * - * It works because MixColumns is a linear operation over GF(2^8) and - * AddRoundKey is an exclusive or, which is equivalent to addition over - * GF(2^8). (The inverse of MixColumns needs to be applied to the - * affected round keys separately which has been done when the - * decryption round keys were calculated.) */ - block = vaesimcq_u8(block); + /* Assume either 10, 12 or 14 rounds */ + if (rounds == 10) { + goto rounds_10; + } + if (rounds == 12) { + goto rounds_12; } + /* AES AddRoundKey, SubBytes, ShiftRows */ + block = vaesdq_u8(block, vld1q_u8(keys)); + /* AES inverse MixColumns for the next round. + * + * This means that we switch the order of the inverse AddRoundKey and + * inverse MixColumns operations. We have to do this as AddRoundKey is + * done in an atomic instruction together with the inverses of SubBytes + * and ShiftRows. + * + * It works because MixColumns is a linear operation over GF(2^8) and + * AddRoundKey is an exclusive or, which is equivalent to addition over + * GF(2^8). (The inverse of MixColumns needs to be applied to the + * affected round keys separately which has been done when the + * decryption round keys were calculated.) */ + block = vaesimcq_u8(block); + keys += 16; + + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; +rounds_12: + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; +rounds_10: + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the * last full round. 
*/ - block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16)); + block = vaesdq_u8(block, vld1q_u8(keys)); + keys += 16; /* Inverse AddRoundKey for inverting the initial round key addition. */ - block = veorq_u8(block, vld1q_u8(keys + rounds * 16)); + block = veorq_u8(block, vld1q_u8(keys)); return block; } From 7fdfd70b19c323f18580b602bc924c37bb67da6d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:42:25 +0100 Subject: [PATCH 03/19] Introduce MBEDTLS_COMPILER_IS_GCC Signed-off-by: Dave Rodgman --- library/bn_mul.h | 6 +----- library/common.h | 6 ++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/library/bn_mul.h b/library/bn_mul.h index c5994f704989..aec7f2db87dd 100644 --- a/library/bn_mul.h +++ b/library/bn_mul.h @@ -661,14 +661,10 @@ #if defined(__arm__) #if defined(__thumb__) && !defined(__thumb2__) -#if !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) +#if defined(MBEDTLS_COMPILER_IS_GCC) /* * Thumb 1 ISA. This code path has only been tested successfully on gcc; * it does not compile on clang or armclang. - * - * Other compilers which define __GNUC__ may not work. The above macro - * attempts to exclude these untested compilers. */ #if !defined(__OPTIMIZE__) && defined(__GNUC__) diff --git a/library/common.h b/library/common.h index b48a1fc66712..cfbff02ad29a 100644 --- a/library/common.h +++ b/library/common.h @@ -195,4 +195,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #define MBEDTLS_UNLIKELY(x) x #endif +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ From b055f75c3d4222b00175a1eb5ea96e577acfa052 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:42:59 +0100 Subject: [PATCH 04/19] Introduce MBEDTLS_OPTIMIZE_ALWAYS Signed-off-by: Dave Rodgman --- library/common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/library/common.h b/library/common.h index cfbff02ad29a..00c2cd360128 100644 --- a/library/common.h +++ b/library/common.h @@ -201,4 +201,11 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #define MBEDTLS_COMPILER_IS_GCC #endif +/* If -Os is specified, override with -O2 for a given function */ +#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__) +#define MBEDTLS_OPTIMIZE_ALWAYS __attribute__((optimize("-O2"))) +#else +#define MBEDTLS_OPTIMIZE_ALWAYS +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ From 03bb526c24a189d8a91f0348eb036baf5491fdf9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:43:24 +0100 Subject: [PATCH 05/19] Add a non-NEON variant of mbedtls_xor Signed-off-by: Dave Rodgman --- library/common.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/library/common.h b/library/common.h index 00c2cd360128..b56ad580c2e5 100644 --- a/library/common.h +++ b/library/common.h @@ -154,6 +154,42 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned } } +/** + * Perform a fast block XOR operation, such that + * r[i] = a[i] ^ b[i] where 0 <= i < n + * + * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5% + * better in AES-CBC). + * + * \param r Pointer to result (buffer of at least \p n bytes). 
\p r + * may be equal to either \p a or \p b, but behaviour when + * it overlaps in other ways is undefined. + * \param a Pointer to input (buffer of at least \p n bytes) + * \param b Pointer to input (buffer of at least \p n bytes) + * \param n Number of bytes to process. + */ +static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +{ + size_t i = 0; +#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) +#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) + /* This codepath probably only makes sense on architectures with 64-bit registers */ + for (; (i + 8) <= n; i += 8) { + uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); + mbedtls_put_unaligned_uint64(r + i, x); + } +#else + for (; (i + 4) <= n; i += 4) { + uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); + mbedtls_put_unaligned_uint32(r + i, x); + } +#endif +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } +} + /* Fix MSVC C99 compatible issue * MSVC support __func__ from visual studio 2015( 1900 ) * Use MSVC predefine macro to avoid name check fail. From a0b166e11e8943378109e5167f2745453d7ea877 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:44:16 +0100 Subject: [PATCH 06/19] Use mbedtls_xor_no_simd from cmac and cbc Signed-off-by: Dave Rodgman --- library/aes.c | 24 +++++------------------- library/cmac.c | 4 ++-- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/library/aes.c b/library/aes.c index 0a61d1b07046..d2687bcf3dd1 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1040,23 +1040,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, #if defined(MBEDTLS_CIPHER_MODE_CBC) -#if defined(__ARM_NEON) && defined(__aarch64__) -/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on - * the result for the next block in CBC, and the cost of transferring that data from - * NEON registers, it is faster to use the following on aarch64. - * For 32-bit arm, NEON should be faster. */ -#define CBC_XOR_16(r, a, b) do { \ - mbedtls_put_unaligned_uint64(r, \ - mbedtls_get_unaligned_uint64(a) ^ \ - mbedtls_get_unaligned_uint64(b)); \ - mbedtls_put_unaligned_uint64(r + 8, \ - mbedtls_get_unaligned_uint64(a + 8) ^ \ - mbedtls_get_unaligned_uint64(b + 8)); \ -} while (0) -#else -#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16) -#endif - /* * AES-CBC buffer encryption/decryption */ @@ -1099,7 +1082,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, if (ret != 0) { goto exit; } - CBC_XOR_16(output, output, iv); + /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, NEON is slower on aarch64. 
*/ + mbedtls_xor_no_simd(output, output, iv, 16); memcpy(iv, temp, 16); @@ -1109,7 +1095,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - CBC_XOR_16(output, input, ivp); + mbedtls_xor_no_simd(output, input, ivp, 16); ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) { diff --git a/library/cmac.c b/library/cmac.c index 48f51df41d61..2f19d112975b 100644 --- a/library/cmac.c +++ b/library/cmac.c @@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx, input, block_size - cmac_ctx->unprocessed_len); - mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size); + mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size); if ((ret = mbedtls_cipher_update(ctx, state, block_size, state, &olen)) != 0) { @@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx, /* Iterate across the input data in block sized chunks, excluding any * final partial or complete block */ for (j = 1; j < n; j++) { - mbedtls_xor(state, input, state, block_size); + mbedtls_xor_no_simd(state, input, state, block_size); if ((ret = mbedtls_cipher_update(ctx, state, block_size, state, &olen)) != 0) { From 6cfd9b54ae0d06451c1a46a10e57fa099878bb03 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:46:23 +0100 Subject: [PATCH 07/19] use MBEDTLS_OPTIMIZE_ALWAYS in AES-XTS Signed-off-by: Dave Rodgman --- library/aes.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/library/aes.c b/library/aes.c index d2687bcf3dd1..6ec4d78086bc 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,8 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ -static void mbedtls_gf128mul_x_ble(unsigned char r[16], +MBEDTLS_OPTIMIZE_ALWAYS +static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { uint64_t a, b, ra, rb; @@ -1145,7 +1146,11 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16], /* * AES-XTS buffer encryption/decryption + * + * Use of MBEDTLS_OPTIMIZE_ALWAYS here and for mbedtls_gf128mul_x_ble() + * is a 3x performance improvement for gcc -Os! 
*/ +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, From f88a68cf514d68adf2cf63e0f22fc0ffb8faef2b Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:46:41 +0100 Subject: [PATCH 08/19] Use MBEDTLS_OPTIMIZE_ALWAYS in aesce Signed-off-by: Dave Rodgman --- library/aesce.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/library/aesce.c b/library/aesce.c index e21e3b39da35..6b493a27292c 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -101,6 +101,7 @@ int mbedtls_aesce_has_support(void) #endif } +MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -167,6 +168,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, return block; } +MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -249,6 +251,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], From 9149c321923fbab45a2bfcd048f3661737e9e8e1 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:50:21 +0100 Subject: [PATCH 09/19] Use MBEDTLS_OPTIMIZE_ALWAYS for ccm Signed-off-by: Dave Rodgman --- library/ccm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/library/ccm.c b/library/ccm.c index 36c999e7d73c..1e644dc17245 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,6 +326,7 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, From 660cd378e182606a4d0760720980035eeb12c48d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:53:35 +0100 Subject: [PATCH 10/19] Use MBEDTLS_OPTIMIZE_ALWAYS for gcm Signed-off-by: Dave Rodgman --- library/gcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/library/gcm.c b/library/gcm.c index 35823e3d714d..02f8cbfcaad3 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,6 +417,7 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From 3650a605869456fc3122c46b0d312c4c0b4f9960 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:56:15 +0100 Subject: [PATCH 11/19] Update changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index ca2ced92ed48..696945739b44 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,4 +1,4 @@ Features - * AES performance improvements on 64-bit architectures. Uplift - varies by platform, toolchain, optimisation flags and mode, - in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most. + * AES performance improvements. Uplift varies by platform, + toolchain, optimisation flags and mode, up to 4.5x. + Aarch64, gcc -Os and CCM, GCM and XTS benefit the most. 
From 2dd15b3ab50068fd20371cf228a96a7c49e70baa Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 20:27:53 +0100 Subject: [PATCH 12/19] code style Signed-off-by: Dave Rodgman --- library/aes.c | 6 +++--- library/common.h | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/library/aes.c b/library/aes.c index 6ec4d78086bc..977b3de2d888 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1083,8 +1083,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, goto exit; } /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on - * the result for the next block in CBC, and the cost of transferring that data from - * NEON registers, NEON is slower on aarch64. */ + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, NEON is slower on aarch64. */ mbedtls_xor_no_simd(output, output, iv, 16); memcpy(iv, temp, 16); @@ -1130,7 +1130,7 @@ typedef unsigned char mbedtls_be128[16]; */ MBEDTLS_OPTIMIZE_ALWAYS static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], - const unsigned char x[16]) + const unsigned char x[16]) { uint64_t a, b, ra, rb; diff --git a/library/common.h b/library/common.h index b56ad580c2e5..c477e1d373aa 100644 --- a/library/common.h +++ b/library/common.h @@ -168,7 +168,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. */ -static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +static inline void mbedtls_xor_no_simd(unsigned char *r, + const unsigned char *a, + const unsigned char *b, + size_t n) { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) From 48fd2ab5d5e9346a94c16f9c8994114f6640f42f Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 09:36:50 +0100 Subject: [PATCH 13/19] Improve readability of unrolled AESCE code Signed-off-by: Dave Rodgman --- library/aesce.c | 149 ++++++++++++++++++------------------------------ library/aesce.h | 4 +- 2 files changed, 56 insertions(+), 97 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 6b493a27292c..600326a08aab 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -101,59 +101,36 @@ int mbedtls_aesce_has_support(void) #endif } +/* Single round of AESCE encryption */ +#define AESCE_ENCRYPT_ROUND \ + block = vaeseq_u8(block, vld1q_u8(keys)); \ + block = vaesmcq_u8(block); \ + keys += 16 +/* Two rounds of AESCE encryption */ +#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND + MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds */ + /* Assume either 10, 12 or 14 rounds. 
+ * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ if (rounds == 10) { goto rounds_10; } if (rounds == 12) { goto rounds_12; } - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; rounds_12: - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; rounds_10: - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND; /* AES AddRoundKey for the previous round. * SubBytes, ShiftRows for the final round. */ @@ -168,74 +145,56 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, return block; } +/* Single round of AESCE decryption + * + * AES AddRoundKey, SubBytes, ShiftRows + * + * block = vaesdq_u8(block, vld1q_u8(keys)); + * + * AES inverse MixColumns for the next round. + * + * This means that we switch the order of the inverse AddRoundKey and + * inverse MixColumns operations. We have to do this as AddRoundKey is + * done in an atomic instruction together with the inverses of SubBytes + * and ShiftRows. + * + * It works because MixColumns is a linear operation over GF(2^8) and + * AddRoundKey is an exclusive or, which is equivalent to addition over + * GF(2^8). (The inverse of MixColumns needs to be applied to the + * affected round keys separately which has been done when the + * decryption round keys were calculated.) + * + * block = vaesimcq_u8(block); + */ +#define AESCE_DECRYPT_ROUND \ + block = vaesdq_u8(block, vld1q_u8(keys)); \ + block = vaesimcq_u8(block); \ + keys += 16 +/* Two rounds of AESCE decryption */ +#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND + MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds */ + /* Assume either 10, 12 or 14 rounds. + * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ if (rounds == 10) { goto rounds_10; } if (rounds == 12) { goto rounds_12; } - - /* AES AddRoundKey, SubBytes, ShiftRows */ - block = vaesdq_u8(block, vld1q_u8(keys)); - /* AES inverse MixColumns for the next round. - * - * This means that we switch the order of the inverse AddRoundKey and - * inverse MixColumns operations. We have to do this as AddRoundKey is - * done in an atomic instruction together with the inverses of SubBytes - * and ShiftRows. - * - * It works because MixColumns is a linear operation over GF(2^8) and - * AddRoundKey is an exclusive or, which is equivalent to addition over - * GF(2^8). 
(The inverse of MixColumns needs to be applied to the - * affected round keys separately which has been done when the - * decryption round keys were calculated.) */ - block = vaesimcq_u8(block); - keys += 16; - - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; rounds_12: - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; rounds_10: - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND; /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the * last full round. */ diff --git a/library/aesce.h b/library/aesce.h index 61e73bfddd97..b12bf76ba446 100644 --- a/library/aesce.h +++ b/library/aesce.h @@ -52,8 +52,8 @@ int mbedtls_aesce_has_support(void); /** * \brief Internal AES-ECB block encryption and decryption * - * Note: this assumes that the context specifies either 10, 12 or 14 rounds - * and will behave incorrectly if this is not the case. + * \warning This assumes that the context specifies either 10, 12 or 14 + * rounds and will behave incorrectly if this is not the case. * * \param ctx AES context * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT From 9bb7e6f4ce3aa65f6bdc07a4efd7c5856b181aa3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 09:41:21 +0100 Subject: [PATCH 14/19] Rename MBEDTLS_OPTIMIZE_ALWAYS Signed-off-by: Dave Rodgman --- library/aes.c | 6 +++--- library/aesce.c | 6 +++--- library/ccm.c | 2 +- library/common.h | 9 ++++++--- library/gcm.c | 2 +- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/library/aes.c b/library/aes.c index 977b3de2d888..b446265b255e 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,7 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { @@ -1147,10 +1147,10 @@ static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], /* * AES-XTS buffer encryption/decryption * - * Use of MBEDTLS_OPTIMIZE_ALWAYS here and for mbedtls_gf128mul_x_ble() + * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble() * is a 3x performance improvement for gcc -Os! 
*/ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, diff --git a/library/aesce.c b/library/aesce.c index 600326a08aab..ecfadcd9df68 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -109,7 +109,7 @@ int mbedtls_aesce_has_support(void) /* Two rounds of AESCE encryption */ #define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -173,7 +173,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, /* Two rounds of AESCE decryption */ #define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -210,7 +210,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], diff --git a/library/ccm.c b/library/ccm.c index 1e644dc17245..81bdfe7cc4cd 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,7 +326,7 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, diff --git a/library/common.h b/library/common.h index c477e1d373aa..497886f4d5bb 100644 --- a/library/common.h +++ b/library/common.h @@ -240,11 +240,14 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, #define MBEDTLS_COMPILER_IS_GCC #endif -/* If -Os is specified, override with -O2 for a given function */ +/* For gcc -Os, override with -O2 for a given function. + * + * This will not affect behaviour for other optimisation settings, e.g. -O0. 
+ */ #if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__) -#define MBEDTLS_OPTIMIZE_ALWAYS __attribute__((optimize("-O2"))) +#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2"))) #else -#define MBEDTLS_OPTIMIZE_ALWAYS +#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE #endif #endif /* MBEDTLS_LIBRARY_COMMON_H */ diff --git a/library/gcm.c b/library/gcm.c index 02f8cbfcaad3..6d7ef21c41d7 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,7 +417,7 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From bd1add94c006be18b9961580b0325269f6120195 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 13:50:14 +0100 Subject: [PATCH 15/19] Respect -Os for everything except XTS Signed-off-by: Dave Rodgman --- library/aesce.c | 2 -- library/ccm.c | 1 - library/gcm.c | 1 - 3 files changed, 4 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index ecfadcd9df68..1f3c83b8f1b4 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -173,7 +173,6 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, /* Two rounds of AESCE decryption */ #define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -210,7 +209,6 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], diff --git a/library/ccm.c b/library/ccm.c index 81bdfe7cc4cd..36c999e7d73c 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,7 +326,6 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, diff --git a/library/gcm.c b/library/gcm.c index 6d7ef21c41d7..35823e3d714d 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,7 +417,6 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From 73b0c0b051fee1a406593ab1ededbcb605ce9a5d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 14:48:14 +0100 Subject: [PATCH 16/19] Improve comment Signed-off-by: Dave Rodgman --- library/aesce.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 1f3c83b8f1b4..c3aae85e8156 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -114,8 +114,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds. - * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ + /* 10, 12 or 14 rounds. Unroll loop. */ if (rounds == 10) { goto rounds_10; } @@ -177,8 +176,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds. - * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ + /* 10, 12 or 14 rounds. Unroll loop. 
*/ if (rounds == 10) { goto rounds_10; } From b2814bd089077b7ec195449a5f7fef316b7b3065 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 14:50:33 +0100 Subject: [PATCH 17/19] Only enable gcc -Os fix if we have AES hw support Signed-off-by: Dave Rodgman --- library/aes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/library/aes.c b/library/aes.c index b446265b255e..ce458b6f9885 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1148,9 +1148,11 @@ static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], * AES-XTS buffer encryption/decryption * * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble() - * is a 3x performance improvement for gcc -Os! + * is a 3x performance improvement for gcc -Os, if we have hardware AES support. */ +#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C) MBEDTLS_OPTIMIZE_FOR_PERFORMANCE +#endif int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, From 4ad81ccdae220ede9b6aee7ef6ba2c2cebbccaf9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 15:04:04 +0100 Subject: [PATCH 18/19] Only force O2 when hw acceleration available Signed-off-by: Dave Rodgman --- library/aes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/aes.c b/library/aes.c index ce458b6f9885..49c308958836 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,9 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ +#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C) MBEDTLS_OPTIMIZE_FOR_PERFORMANCE +#endif static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { From 418843ed6426136642a3433cd34d2883234ff166 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 15:27:23 +0100 Subject: [PATCH 19/19] Improve changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index 696945739b44..ab716bce8c7e 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,4 +1,7 @@ Features * AES performance improvements. Uplift varies by platform, - toolchain, optimisation flags and mode, up to 4.5x. + toolchain, optimisation flags and mode. Aarch64, gcc -Os and CCM, GCM and XTS benefit the most. + On Aarch64, uplift is typically around 20 - 110%. + When compiling with gcc -Os on Aarch64, AES-XTS improves + by 4.5x.
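
The sketch below shows what the unrolled encryption path amounts to for the common 10-round (AES-128) case, using the AESCE_ENCRYPT_ROUND macros from patch 13 but without the goto dispatch. It is an illustration only: the helper's name is made up and does not appear in the series, and it has to be built with the Armv8-A crypto extensions enabled, as aesce.c already requires. The 12- and 14-round cases simply run one or two extra pairs of rounds before reaching this sequence.

#include <arm_neon.h>

/* Round macros as introduced in "Improve readability of unrolled AESCE code". */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

/* Illustrative helper (not part of the patches): the rounds_10 path only. */
static uint8x16_t aesce_encrypt_block_aes128(uint8x16_t block,
                                             unsigned char *keys)
{
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 1-2 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 3-4 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 5-6 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 7-8 */
    AESCE_ENCRYPT_ROUND;     /* round 9 */
    /* AddRoundKey for the previous round, SubBytes and ShiftRows for the
     * final round (no MixColumns), then the final AddRoundKey. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;
    block = veorq_u8(block, vld1q_u8(keys));
    return block;
}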
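
The reordering argument in the decryption comments relies on the inverse MixColumns being linear over GF(2^8): InvMixColumns(a ^ b) == InvMixColumns(a) ^ InvMixColumns(b), which is why InvMixColumns can be applied to the middle decryption round keys in advance, when the key schedule is built. A minimal self-contained check of that identity, using arbitrary test bytes (again an illustration only; build for AArch64 with the crypto extensions):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint8_t ab[16], bb[16], l[16], r[16];
    for (int i = 0; i < 16; i++) {
        ab[i] = (uint8_t) (i * 17 + 3);      /* arbitrary test pattern */
        bb[i] = (uint8_t) (i * 29 + 101);
    }
    uint8x16_t a = vld1q_u8(ab);
    uint8x16_t b = vld1q_u8(bb);

    /* InvMixColumns(a ^ b) vs InvMixColumns(a) ^ InvMixColumns(b) */
    vst1q_u8(l, vaesimcq_u8(veorq_u8(a, b)));
    vst1q_u8(r, veorq_u8(vaesimcq_u8(a), vaesimcq_u8(b)));

    printf("identity %s\n", memcmp(l, r, 16) == 0 ? "holds" : "FAILS");
    return 0;
}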
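
The interaction of MBEDTLS_COMPILER_IS_GCC, __OPTIMIZE_SIZE__ and MBEDTLS_OPTIMIZE_FOR_PERFORMANCE can also be reproduced outside the library. The sketch below copies the two macro definitions from library/common.h and applies them to a hypothetical function that is not part of the patches: under gcc -Os the attribute bumps just that function to -O2, while under clang, armclang, ICC, or any build that is not -Os it expands to nothing. This is the behaviour the later patches rely on for mbedtls_aes_crypt_xts() and mbedtls_gf128mul_x_ble() when hardware AES support is compiled in.

#include <stddef.h>

/* Macro definitions copied from library/common.h (patches 03, 04 and 14). */
#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
#define MBEDTLS_COMPILER_IS_GCC
#endif

#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
#else
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
#endif

/* Hypothetical hot loop: compiled at -O2 even in a gcc -Os build. */
MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
unsigned int sum_bytes(const unsigned char *p, size_t n)
{
    unsigned int acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc += p[i];
    }
    return acc;
}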