Merge pull request Mbed-TLS#7784 from daverodgman/aesce-unroll
daverodgman authored Jul 4, 2023
2 parents a2eff62 + 418843e commit c8d81ad
Showing 7 changed files with 156 additions and 58 deletions.
9 changes: 6 additions & 3 deletions ChangeLog.d/aes-perf.txt
@@ -1,4 +1,7 @@
Features
- * AES performance improvements on 64-bit architectures. Uplift
-   varies by platform, toolchain, optimisation flags and mode,
-   in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
+ * AES performance improvements. Uplift varies by platform,
+   toolchain, optimisation flags and mode.
+   Aarch64, gcc -Os and CCM, GCM and XTS benefit the most.
+   On Aarch64, uplift is typically around 20 - 110%.
+   When compiling with gcc -Os on Aarch64, AES-XTS improves
+   by 4.5x.
37 changes: 16 additions & 21 deletions library/aes.c
@@ -1077,23 +1077,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,

#if defined(MBEDTLS_CIPHER_MODE_CBC)

- #if defined(__ARM_NEON) && defined(__aarch64__)
- /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
- * the result for the next block in CBC, and the cost of transferring that data from
- * NEON registers, it is faster to use the following on aarch64.
- * For 32-bit arm, NEON should be faster. */
- #define CBC_XOR_16(r, a, b) do { \
- mbedtls_put_unaligned_uint64(r, \
- mbedtls_get_unaligned_uint64(a) ^ \
- mbedtls_get_unaligned_uint64(b)); \
- mbedtls_put_unaligned_uint64(r + 8, \
- mbedtls_get_unaligned_uint64(a + 8) ^ \
- mbedtls_get_unaligned_uint64(b + 8)); \
- } while (0)
- #else
- #define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
- #endif

/*
* AES-CBC buffer encryption/decryption
*/
@@ -1136,7 +1119,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
if (ret != 0) {
goto exit;
}
- CBC_XOR_16(output, output, iv);
+ /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
+ * the result for the next block in CBC, and the cost of transferring that data from
+ * NEON registers, NEON is slower on aarch64. */
+ mbedtls_xor_no_simd(output, output, iv, 16);

memcpy(iv, temp, 16);
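To see why a plain scalar XOR wins here, this is a condensed view of the CBC decryption loop in this hunk (error handling omitted; a sketch, not the exact library code). Each iteration's XOR needs the previous ciphertext block as iv, and iv is then refreshed from the block just consumed, so the 16-byte XOR sits on a serial dependency chain between AES calls, and a round trip through NEON registers only adds transfer latency:

    while (length > 0) {
        memcpy(temp, input, 16);                      /* save ciphertext for the next iv */
        mbedtls_aes_crypt_ecb(ctx, MBEDTLS_AES_DECRYPT, input, output);
        mbedtls_xor_no_simd(output, output, iv, 16);  /* depends on the previous block */
        memcpy(iv, temp, 16);                         /* feeds the next iteration */
        input += 16; output += 16; length -= 16;
    }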

@@ -1146,7 +1132,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
}
} else {
while (length > 0) {
- CBC_XOR_16(output, input, ivp);
+ mbedtls_xor_no_simd(output, input, ivp, 16);

ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
if (ret != 0) {
@@ -1179,8 +1165,11 @@ typedef unsigned char mbedtls_be128[16];
* for machine endianness and hence works correctly on both big and little
* endian machines.
*/
- static void mbedtls_gf128mul_x_ble(unsigned char r[16],
- const unsigned char x[16])
+ #if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+ MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+ #endif
+ static inline void mbedtls_gf128mul_x_ble(unsigned char r[16],
+ const unsigned char x[16])
{
uint64_t a, b, ra, rb;

@@ -1196,7 +1185,13 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16],

/*
* AES-XTS buffer encryption/decryption
+ *
+ * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble()
+ * is a 3x performance improvement for gcc -Os, if we have hardware AES support.
*/
+ #if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+ MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+ #endif
int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
int mode,
size_t length,
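For reference, mbedtls_gf128mul_x_ble() (whose body is largely elided above) performs the XTS tweak update: multiplication by x in GF(2^128) in the little-endian convention that XTS uses. A standalone sketch of that operation, assuming a little-endian host so that memcpy yields the two 64-bit halves directly (not the library's exact body, which uses endian-safe load/store helpers):

    #include <stdint.h>
    #include <string.h>

    /* Multiply a 128-bit XTS tweak by x: shift the 128-bit value left by one
     * bit and, if a bit falls off the top, reduce modulo the AES-XTS
     * polynomial x^128 + x^7 + x^2 + x + 1 (low byte 0x87). */
    static void gf128mul_x_ble_sketch(unsigned char r[16], const unsigned char x[16])
    {
        uint64_t a, b, ra, rb;

        memcpy(&a, x, 8);        /* low 64 bits (little-endian host assumed) */
        memcpy(&b, x + 8, 8);    /* high 64 bits */

        ra = (a << 1) ^ (0x87 & (uint64_t) -(int64_t) (b >> 63)); /* conditional reduction */
        rb = (b << 1) | (a >> 63);                                /* carry from the low half */

        memcpy(r, &ra, 8);
        memcpy(r + 8, &rb, 8);
    }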
100 changes: 73 additions & 27 deletions library/aesce.c
@@ -101,59 +101,105 @@ int mbedtls_aesce_has_support(void)
#endif
}

+ /* Single round of AESCE encryption */
+ #define AESCE_ENCRYPT_ROUND \
+ block = vaeseq_u8(block, vld1q_u8(keys)); \
+ block = vaesmcq_u8(block); \
+ keys += 16
+ /* Two rounds of AESCE encryption */
+ #define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
+
+ MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
static uint8x16_t aesce_encrypt_block(uint8x16_t block,
unsigned char *keys,
int rounds)
{
- for (int i = 0; i < rounds - 1; i++) {
- /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
- * AddRoundKey adds the round key for the previous round. */
- block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
- /* AES mix columns */
- block = vaesmcq_u8(block);
+ /* 10, 12 or 14 rounds. Unroll loop. */
+ if (rounds == 10) {
+ goto rounds_10;
}
+ if (rounds == 12) {
+ goto rounds_12;
+ }
+ AESCE_ENCRYPT_ROUND_X2;
+ rounds_12:
+ AESCE_ENCRYPT_ROUND_X2;
+ rounds_10:
+ AESCE_ENCRYPT_ROUND_X2;
+ AESCE_ENCRYPT_ROUND_X2;
+ AESCE_ENCRYPT_ROUND_X2;
+ AESCE_ENCRYPT_ROUND_X2;
+ AESCE_ENCRYPT_ROUND;

/* AES AddRoundKey for the previous round.
* SubBytes, ShiftRows for the final round. */
- block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16));
+ block = vaeseq_u8(block, vld1q_u8(keys));
+ keys += 16;

/* Final round: no MixColumns */

/* Final AddRoundKey */
- block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+ block = veorq_u8(block, vld1q_u8(keys));

return block;
}
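The goto dispatch above simply jumps over the rounds that a shorter key schedule does not need and then falls through a fully unrolled ladder: 13 full rounds for a 14-round schedule, 11 for 12, 9 for 10, with the final SubBytes/ShiftRows/AddRoundKey handled separately after the ladder. The same shape can be written with a switch and deliberate fallthrough; a generic sketch (block_t and one_round() are hypothetical, this is not the library code):

    /* Run rounds - 1 full AES rounds, fully unrolled; the caller applies the
     * final round (no MixColumns) and the last AddRoundKey afterwards. */
    #define ROUND() do { state = one_round(state, rk); rk += 16; } while (0)

    static block_t run_full_rounds(block_t state, const unsigned char *rk, int rounds)
    {
        switch (rounds) {               /* only 10, 12 or 14 are valid */
            case 14: ROUND(); ROUND();  /* fall through */
            case 12: ROUND(); ROUND();  /* fall through */
            case 10: ROUND(); ROUND(); ROUND(); ROUND(); ROUND();
                     ROUND(); ROUND(); ROUND(); ROUND();
        }
        return state;
    }
    #undef ROUND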

+ /* Single round of AESCE decryption
+ *
+ * AES AddRoundKey, SubBytes, ShiftRows
+ *
+ * block = vaesdq_u8(block, vld1q_u8(keys));
+ *
+ * AES inverse MixColumns for the next round.
+ *
+ * This means that we switch the order of the inverse AddRoundKey and
+ * inverse MixColumns operations. We have to do this as AddRoundKey is
+ * done in an atomic instruction together with the inverses of SubBytes
+ * and ShiftRows.
+ *
+ * It works because MixColumns is a linear operation over GF(2^8) and
+ * AddRoundKey is an exclusive or, which is equivalent to addition over
+ * GF(2^8). (The inverse of MixColumns needs to be applied to the
+ * affected round keys separately which has been done when the
+ * decryption round keys were calculated.)
+ *
+ * block = vaesimcq_u8(block);
+ */
+ #define AESCE_DECRYPT_ROUND \
+ block = vaesdq_u8(block, vld1q_u8(keys)); \
+ block = vaesimcq_u8(block); \
+ keys += 16
+ /* Two rounds of AESCE decryption */
+ #define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND

static uint8x16_t aesce_decrypt_block(uint8x16_t block,
unsigned char *keys,
int rounds)
{

- for (int i = 0; i < rounds - 1; i++) {
- /* AES AddRoundKey, SubBytes, ShiftRows */
- block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
- /* AES inverse MixColumns for the next round.
- *
- * This means that we switch the order of the inverse AddRoundKey and
- * inverse MixColumns operations. We have to do this as AddRoundKey is
- * done in an atomic instruction together with the inverses of SubBytes
- * and ShiftRows.
- *
- * It works because MixColumns is a linear operation over GF(2^8) and
- * AddRoundKey is an exclusive or, which is equivalent to addition over
- * GF(2^8). (The inverse of MixColumns needs to be applied to the
- * affected round keys separately which has been done when the
- * decryption round keys were calculated.) */
- block = vaesimcq_u8(block);
+ /* 10, 12 or 14 rounds. Unroll loop. */
+ if (rounds == 10) {
+ goto rounds_10;
}
+ if (rounds == 12) {
+ goto rounds_12;
+ }
+ AESCE_DECRYPT_ROUND_X2;
+ rounds_12:
+ AESCE_DECRYPT_ROUND_X2;
+ rounds_10:
+ AESCE_DECRYPT_ROUND_X2;
+ AESCE_DECRYPT_ROUND_X2;
+ AESCE_DECRYPT_ROUND_X2;
+ AESCE_DECRYPT_ROUND_X2;
+ AESCE_DECRYPT_ROUND;

/* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
* last full round. */
- block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));
+ block = vaesdq_u8(block, vld1q_u8(keys));
+ keys += 16;

/* Inverse AddRoundKey for inverting the initial round key addition. */
- block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+ block = veorq_u8(block, vld1q_u8(keys));

return block;
}
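The comment block above is doing the heavy lifting for this reordering: InvMixColumns is linear over GF(2^8) and AddRoundKey is an XOR (addition in GF(2^8)), so for any state s and round key k

    InvMixColumns(s XOR k) = InvMixColumns(s) XOR InvMixColumns(k)

Running vaesimcq_u8 before the next key addition therefore gives the same result as the textbook order, provided each affected round key k was already replaced by InvMixColumns(k) when the decryption key schedule was computed, which is exactly what the comment notes.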
3 changes: 3 additions & 0 deletions library/aesce.h
@@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void);
/**
* \brief Internal AES-ECB block encryption and decryption
*
+ * \warning This assumes that the context specifies either 10, 12 or 14
+ * rounds and will behave incorrectly if this is not the case.
+ *
* \param ctx AES context
* \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
* \param input 16-byte input block
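For context, the 10/12/14 figure follows FIPS 197, where the round count is Nr = Nk + 6 with Nk the key length in 32-bit words. A hypothetical helper (not part of this API) makes the mapping explicit:

    /* AES-128 -> 10 rounds, AES-192 -> 12, AES-256 -> 14 (FIPS 197, Nr = Nk + 6). */
    static inline unsigned aes_rounds_for_keybits(unsigned key_bits)
    {
        return key_bits / 32 + 6;
    }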
6 changes: 1 addition & 5 deletions library/bn_mul.h
@@ -673,14 +673,10 @@
#if defined(__arm__)

#if defined(__thumb__) && !defined(__thumb2__)
- #if !defined(__ARMCC_VERSION) && !defined(__clang__) \
- && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+ #if defined(MBEDTLS_COMPILER_IS_GCC)
/*
* Thumb 1 ISA. This code path has only been tested successfully on gcc;
* it does not compile on clang or armclang.
- *
- * Other compilers which define __GNUC__ may not work. The above macro
- * attempts to exclude these untested compilers.
*/

#if !defined(__OPTIMIZE__) && defined(__GNUC__)
4 changes: 2 additions & 2 deletions library/cmac.c
@@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
input,
block_size - cmac_ctx->unprocessed_len);

- mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size);
+ mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size);

if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
&olen)) != 0) {
@@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
/* Iterate across the input data in block sized chunks, excluding any
* final partial or complete block */
for (j = 1; j < n; j++) {
- mbedtls_xor(state, input, state, block_size);
+ mbedtls_xor_no_simd(state, input, state, block_size);

if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
&olen)) != 0) {
55 changes: 55 additions & 0 deletions library/common.h
@@ -192,6 +192,45 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
}
}

/**
* Perform a fast block XOR operation, such that
* r[i] = a[i] ^ b[i] where 0 <= i < n
*
* In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
* better in AES-CBC).
*
* \param r Pointer to result (buffer of at least \p n bytes). \p r
* may be equal to either \p a or \p b, but behaviour when
* it overlaps in other ways is undefined.
* \param a Pointer to input (buffer of at least \p n bytes)
* \param b Pointer to input (buffer of at least \p n bytes)
* \param n Number of bytes to process.
*/
static inline void mbedtls_xor_no_simd(unsigned char *r,
const unsigned char *a,
const unsigned char *b,
size_t n)
{
size_t i = 0;
#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
/* This codepath probably only makes sense on architectures with 64-bit registers */
for (; (i + 8) <= n; i += 8) {
uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
mbedtls_put_unaligned_uint64(r + i, x);
}
#else
for (; (i + 4) <= n; i += 4) {
uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
mbedtls_put_unaligned_uint32(r + i, x);
}
#endif
#endif
for (; i < n; i++) {
r[i] = a[i] ^ b[i];
}
}
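Usage is the same as for mbedtls_xor(); for example, folding a 16-byte round key into a state buffer in place (a sketch, not library code):

    unsigned char state[16], key[16];
    /* ... fill state and key ... */
    mbedtls_xor_no_simd(state, state, key, 16);   /* r may alias a or b */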

/* Fix MSVC C99 compatible issue
* MSVC support __func__ from visual studio 2015( 1900 )
* Use MSVC predefine macro to avoid name check fail.
@@ -261,4 +300,20 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
#define MBEDTLS_UNLIKELY(x) x
#endif

#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
&& !defined(__llvm__) && !defined(__INTEL_COMPILER)
/* Defined if the compiler really is gcc and not clang, etc */
#define MBEDTLS_COMPILER_IS_GCC
#endif

/* For gcc -Os, override with -O2 for a given function.
*
* This will not affect behaviour for other optimisation settings, e.g. -O0.
*/
#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
#else
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
#endif

#endif /* MBEDTLS_LIBRARY_COMMON_H */
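As in library/aes.c above, the attribute is placed immediately before a hot function's definition: gcc -Os builds then compile just that function at -O2, while other compilers and optimisation levels see an empty macro. A sketch with a hypothetical function name:

    MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
    static void aes_hot_path(unsigned char out[16], const unsigned char in[16])
    {
        /* ... performance-critical body ... */
    }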
