From 96fdfb8e62ccb6249b8c3e64b783dde29be13e46 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 16:21:31 +0100 Subject: [PATCH 01/19] Unroll aesce_encrypt_block Signed-off-by: Dave Rodgman --- library/aesce.c | 58 ++++++++++++++++++++++++++++++++++++++++++------- library/aesce.h | 3 +++ 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 4db8d2a1993e..abd47b1d8543 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -105,22 +105,64 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - for (int i = 0; i < rounds - 1; i++) { - /* AES AddRoundKey, SubBytes, ShiftRows (in this order). - * AddRoundKey adds the round key for the previous round. */ - block = vaeseq_u8(block, vld1q_u8(keys + i * 16)); - /* AES mix columns */ - block = vaesmcq_u8(block); + /* Assume either 10, 12 or 14 rounds */ + if (rounds == 10) { + goto rounds_10; + } + if (rounds == 12) { + goto rounds_12; } + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; +rounds_12: + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; +rounds_10: + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; + block = vaeseq_u8(block, vld1q_u8(keys)); + block = vaesmcq_u8(block); + keys += 16; /* AES AddRoundKey for the previous round. * SubBytes, ShiftRows for the final round. */ - block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16)); + block = vaeseq_u8(block, vld1q_u8(keys)); + keys += 16; /* Final round: no MixColumns */ /* Final AddRoundKey */ - block = veorq_u8(block, vld1q_u8(keys + rounds * 16)); + block = veorq_u8(block, vld1q_u8(keys)); return block; } diff --git a/library/aesce.h b/library/aesce.h index 7048d77c563d..61e73bfddd97 100644 --- a/library/aesce.h +++ b/library/aesce.h @@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void); /** * \brief Internal AES-ECB block encryption and decryption * + * Note: this assumes that the context specifies either 10, 12 or 14 rounds + * and will behave incorrectly if this is not the case. 
+ * * \param ctx AES context * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT * \param input 16-byte input block From 1c4451d089e30be93382d0a3d5faf43db5f893ca Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 16:28:00 +0100 Subject: [PATCH 02/19] Unroll aesce_decrypt_block Signed-off-by: Dave Rodgman --- library/aesce.c | 84 ++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 65 insertions(+), 19 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index abd47b1d8543..e21e3b39da35 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -171,31 +171,77 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - - for (int i = 0; i < rounds - 1; i++) { - /* AES AddRoundKey, SubBytes, ShiftRows */ - block = vaesdq_u8(block, vld1q_u8(keys + i * 16)); - /* AES inverse MixColumns for the next round. - * - * This means that we switch the order of the inverse AddRoundKey and - * inverse MixColumns operations. We have to do this as AddRoundKey is - * done in an atomic instruction together with the inverses of SubBytes - * and ShiftRows. - * - * It works because MixColumns is a linear operation over GF(2^8) and - * AddRoundKey is an exclusive or, which is equivalent to addition over - * GF(2^8). (The inverse of MixColumns needs to be applied to the - * affected round keys separately which has been done when the - * decryption round keys were calculated.) */ - block = vaesimcq_u8(block); + /* Assume either 10, 12 or 14 rounds */ + if (rounds == 10) { + goto rounds_10; + } + if (rounds == 12) { + goto rounds_12; } + /* AES AddRoundKey, SubBytes, ShiftRows */ + block = vaesdq_u8(block, vld1q_u8(keys)); + /* AES inverse MixColumns for the next round. + * + * This means that we switch the order of the inverse AddRoundKey and + * inverse MixColumns operations. We have to do this as AddRoundKey is + * done in an atomic instruction together with the inverses of SubBytes + * and ShiftRows. + * + * It works because MixColumns is a linear operation over GF(2^8) and + * AddRoundKey is an exclusive or, which is equivalent to addition over + * GF(2^8). (The inverse of MixColumns needs to be applied to the + * affected round keys separately which has been done when the + * decryption round keys were calculated.) */ + block = vaesimcq_u8(block); + keys += 16; + + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; +rounds_12: + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; +rounds_10: + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + block = vaesdq_u8(block, vld1q_u8(keys)); + block = vaesimcq_u8(block); + keys += 16; + /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the * last full round. 
*/ - block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16)); + block = vaesdq_u8(block, vld1q_u8(keys)); + keys += 16; /* Inverse AddRoundKey for inverting the initial round key addition. */ - block = veorq_u8(block, vld1q_u8(keys + rounds * 16)); + block = veorq_u8(block, vld1q_u8(keys)); return block; } From 7fdfd70b19c323f18580b602bc924c37bb67da6d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:42:25 +0100 Subject: [PATCH 03/19] Introduce MBEDTLS_COMPILER_IS_GCC Signed-off-by: Dave Rodgman --- library/bn_mul.h | 6 +----- library/common.h | 6 ++++++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/library/bn_mul.h b/library/bn_mul.h index c5994f704989..aec7f2db87dd 100644 --- a/library/bn_mul.h +++ b/library/bn_mul.h @@ -661,14 +661,10 @@ #if defined(__arm__) #if defined(__thumb__) && !defined(__thumb2__) -#if !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) +#if defined(MBEDTLS_COMPILER_IS_GCC) /* * Thumb 1 ISA. This code path has only been tested successfully on gcc; * it does not compile on clang or armclang. - * - * Other compilers which define __GNUC__ may not work. The above macro - * attempts to exclude these untested compilers. */ #if !defined(__OPTIMIZE__) && defined(__GNUC__) diff --git a/library/common.h b/library/common.h index b48a1fc66712..cfbff02ad29a 100644 --- a/library/common.h +++ b/library/common.h @@ -195,4 +195,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #define MBEDTLS_UNLIKELY(x) x #endif +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ From b055f75c3d4222b00175a1eb5ea96e577acfa052 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:42:59 +0100 Subject: [PATCH 04/19] Introduce MBEDTLS_OPTIMIZE_ALWAYS Signed-off-by: Dave Rodgman --- library/common.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/library/common.h b/library/common.h index cfbff02ad29a..00c2cd360128 100644 --- a/library/common.h +++ b/library/common.h @@ -201,4 +201,11 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned #define MBEDTLS_COMPILER_IS_GCC #endif +/* If -Os is specified, override with -O2 for a given function */ +#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__) +#define MBEDTLS_OPTIMIZE_ALWAYS __attribute__((optimize("-O2"))) +#else +#define MBEDTLS_OPTIMIZE_ALWAYS +#endif + #endif /* MBEDTLS_LIBRARY_COMMON_H */ From 03bb526c24a189d8a91f0348eb036baf5491fdf9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:43:24 +0100 Subject: [PATCH 05/19] Add a non-NEON variant of mbedtls_xor Signed-off-by: Dave Rodgman --- library/common.h | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) diff --git a/library/common.h b/library/common.h index 00c2cd360128..b56ad580c2e5 100644 --- a/library/common.h +++ b/library/common.h @@ -154,6 +154,42 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned } } +/** + * Perform a fast block XOR operation, such that + * r[i] = a[i] ^ b[i] where 0 <= i < n + * + * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5% + * better in AES-CBC). + * + * \param r Pointer to result (buffer of at least \p n bytes). 
\p r + * may be equal to either \p a or \p b, but behaviour when + * it overlaps in other ways is undefined. + * \param a Pointer to input (buffer of at least \p n bytes) + * \param b Pointer to input (buffer of at least \p n bytes) + * \param n Number of bytes to process. + */ +static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +{ + size_t i = 0; +#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) +#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__) + /* This codepath probably only makes sense on architectures with 64-bit registers */ + for (; (i + 8) <= n; i += 8) { + uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); + mbedtls_put_unaligned_uint64(r + i, x); + } +#else + for (; (i + 4) <= n; i += 4) { + uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); + mbedtls_put_unaligned_uint32(r + i, x); + } +#endif +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } +} + /* Fix MSVC C99 compatible issue * MSVC support __func__ from visual studio 2015( 1900 ) * Use MSVC predefine macro to avoid name check fail. From a0b166e11e8943378109e5167f2745453d7ea877 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:44:16 +0100 Subject: [PATCH 06/19] Use mbedtls_xor_no_simd from cmac and cbc Signed-off-by: Dave Rodgman --- library/aes.c | 24 +++++------------------- library/cmac.c | 4 ++-- 2 files changed, 7 insertions(+), 21 deletions(-) diff --git a/library/aes.c b/library/aes.c index 0a61d1b07046..d2687bcf3dd1 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1040,23 +1040,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx, #if defined(MBEDTLS_CIPHER_MODE_CBC) -#if defined(__ARM_NEON) && defined(__aarch64__) -/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on - * the result for the next block in CBC, and the cost of transferring that data from - * NEON registers, it is faster to use the following on aarch64. - * For 32-bit arm, NEON should be faster. */ -#define CBC_XOR_16(r, a, b) do { \ - mbedtls_put_unaligned_uint64(r, \ - mbedtls_get_unaligned_uint64(a) ^ \ - mbedtls_get_unaligned_uint64(b)); \ - mbedtls_put_unaligned_uint64(r + 8, \ - mbedtls_get_unaligned_uint64(a + 8) ^ \ - mbedtls_get_unaligned_uint64(b + 8)); \ -} while (0) -#else -#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16) -#endif - /* * AES-CBC buffer encryption/decryption */ @@ -1099,7 +1082,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, if (ret != 0) { goto exit; } - CBC_XOR_16(output, output, iv); + /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, NEON is slower on aarch64. 
*/ + mbedtls_xor_no_simd(output, output, iv, 16); memcpy(iv, temp, 16); @@ -1109,7 +1095,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, } } else { while (length > 0) { - CBC_XOR_16(output, input, ivp); + mbedtls_xor_no_simd(output, input, ivp, 16); ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output); if (ret != 0) { diff --git a/library/cmac.c b/library/cmac.c index 48f51df41d61..2f19d112975b 100644 --- a/library/cmac.c +++ b/library/cmac.c @@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx, input, block_size - cmac_ctx->unprocessed_len); - mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size); + mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size); if ((ret = mbedtls_cipher_update(ctx, state, block_size, state, &olen)) != 0) { @@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx, /* Iterate across the input data in block sized chunks, excluding any * final partial or complete block */ for (j = 1; j < n; j++) { - mbedtls_xor(state, input, state, block_size); + mbedtls_xor_no_simd(state, input, state, block_size); if ((ret = mbedtls_cipher_update(ctx, state, block_size, state, &olen)) != 0) { From 6cfd9b54ae0d06451c1a46a10e57fa099878bb03 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:46:23 +0100 Subject: [PATCH 07/19] use MBEDTLS_OPTIMIZE_ALWAYS in AES-XTS Signed-off-by: Dave Rodgman --- library/aes.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/library/aes.c b/library/aes.c index d2687bcf3dd1..6ec4d78086bc 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,8 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ -static void mbedtls_gf128mul_x_ble(unsigned char r[16], +MBEDTLS_OPTIMIZE_ALWAYS +static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { uint64_t a, b, ra, rb; @@ -1145,7 +1146,11 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16], /* * AES-XTS buffer encryption/decryption + * + * Use of MBEDTLS_OPTIMIZE_ALWAYS here and for mbedtls_gf128mul_x_ble() + * is a 3x performance improvement for gcc -Os! 
*/ +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, From f88a68cf514d68adf2cf63e0f22fc0ffb8faef2b Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:46:41 +0100 Subject: [PATCH 08/19] Use MBEDTLS_OPTIMIZE_ALWAYS in aesce Signed-off-by: Dave Rodgman --- library/aesce.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/library/aesce.c b/library/aesce.c index e21e3b39da35..6b493a27292c 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -101,6 +101,7 @@ int mbedtls_aesce_has_support(void) #endif } +MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -167,6 +168,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, return block; } +MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -249,6 +251,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], From 9149c321923fbab45a2bfcd048f3661737e9e8e1 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:50:21 +0100 Subject: [PATCH 09/19] Use MBEDTLS_OPTIMIZE_ALWAYS for ccm Signed-off-by: Dave Rodgman --- library/ccm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/library/ccm.c b/library/ccm.c index 36c999e7d73c..1e644dc17245 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,6 +326,7 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, From 660cd378e182606a4d0760720980035eeb12c48d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:53:35 +0100 Subject: [PATCH 10/19] Use MBEDTLS_OPTIMIZE_ALWAYS for gcm Signed-off-by: Dave Rodgman --- library/gcm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/library/gcm.c b/library/gcm.c index 35823e3d714d..02f8cbfcaad3 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,6 +417,7 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } +MBEDTLS_OPTIMIZE_ALWAYS int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From 3650a605869456fc3122c46b0d312c4c0b4f9960 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 18:56:15 +0100 Subject: [PATCH 11/19] Update changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index ca2ced92ed48..696945739b44 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,4 +1,4 @@ Features - * AES performance improvements on 64-bit architectures. Uplift - varies by platform, toolchain, optimisation flags and mode, - in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most. + * AES performance improvements. Uplift varies by platform, + toolchain, optimisation flags and mode, up to 4.5x. + Aarch64, gcc -Os and CCM, GCM and XTS benefit the most. 
From 2dd15b3ab50068fd20371cf228a96a7c49e70baa Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Thu, 15 Jun 2023 20:27:53 +0100 Subject: [PATCH 12/19] code style Signed-off-by: Dave Rodgman --- library/aes.c | 6 +++--- library/common.h | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/library/aes.c b/library/aes.c index 6ec4d78086bc..977b3de2d888 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1083,8 +1083,8 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx, goto exit; } /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on - * the result for the next block in CBC, and the cost of transferring that data from - * NEON registers, NEON is slower on aarch64. */ + * the result for the next block in CBC, and the cost of transferring that data from + * NEON registers, NEON is slower on aarch64. */ mbedtls_xor_no_simd(output, output, iv, 16); memcpy(iv, temp, 16); @@ -1130,7 +1130,7 @@ typedef unsigned char mbedtls_be128[16]; */ MBEDTLS_OPTIMIZE_ALWAYS static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], - const unsigned char x[16]) + const unsigned char x[16]) { uint64_t a, b, ra, rb; diff --git a/library/common.h b/library/common.h index b56ad580c2e5..c477e1d373aa 100644 --- a/library/common.h +++ b/library/common.h @@ -168,7 +168,10 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. */ -static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +static inline void mbedtls_xor_no_simd(unsigned char *r, + const unsigned char *a, + const unsigned char *b, + size_t n) { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) From 48fd2ab5d5e9346a94c16f9c8994114f6640f42f Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 09:36:50 +0100 Subject: [PATCH 13/19] Improve readability of unrolled AESCE code Signed-off-by: Dave Rodgman --- library/aesce.c | 149 ++++++++++++++++++------------------------------ library/aesce.h | 4 +- 2 files changed, 56 insertions(+), 97 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 6b493a27292c..600326a08aab 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -101,59 +101,36 @@ int mbedtls_aesce_has_support(void) #endif } +/* Single round of AESCE encryption */ +#define AESCE_ENCRYPT_ROUND \ + block = vaeseq_u8(block, vld1q_u8(keys)); \ + block = vaesmcq_u8(block); \ + keys += 16 +/* Two rounds of AESCE encryption */ +#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND + MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds */ + /* Assume either 10, 12 or 14 rounds. 
+ * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ if (rounds == 10) { goto rounds_10; } if (rounds == 12) { goto rounds_12; } - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; rounds_12: - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; rounds_10: - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; - block = vaeseq_u8(block, vld1q_u8(keys)); - block = vaesmcq_u8(block); - keys += 16; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND_X2; + AESCE_ENCRYPT_ROUND; /* AES AddRoundKey for the previous round. * SubBytes, ShiftRows for the final round. */ @@ -168,74 +145,56 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, return block; } +/* Single round of AESCE decryption + * + * AES AddRoundKey, SubBytes, ShiftRows + * + * block = vaesdq_u8(block, vld1q_u8(keys)); + * + * AES inverse MixColumns for the next round. + * + * This means that we switch the order of the inverse AddRoundKey and + * inverse MixColumns operations. We have to do this as AddRoundKey is + * done in an atomic instruction together with the inverses of SubBytes + * and ShiftRows. + * + * It works because MixColumns is a linear operation over GF(2^8) and + * AddRoundKey is an exclusive or, which is equivalent to addition over + * GF(2^8). (The inverse of MixColumns needs to be applied to the + * affected round keys separately which has been done when the + * decryption round keys were calculated.) + * + * block = vaesimcq_u8(block); + */ +#define AESCE_DECRYPT_ROUND \ + block = vaesdq_u8(block, vld1q_u8(keys)); \ + block = vaesimcq_u8(block); \ + keys += 16 +/* Two rounds of AESCE decryption */ +#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND + MBEDTLS_OPTIMIZE_ALWAYS static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds */ + /* Assume either 10, 12 or 14 rounds. + * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ if (rounds == 10) { goto rounds_10; } if (rounds == 12) { goto rounds_12; } - - /* AES AddRoundKey, SubBytes, ShiftRows */ - block = vaesdq_u8(block, vld1q_u8(keys)); - /* AES inverse MixColumns for the next round. - * - * This means that we switch the order of the inverse AddRoundKey and - * inverse MixColumns operations. We have to do this as AddRoundKey is - * done in an atomic instruction together with the inverses of SubBytes - * and ShiftRows. - * - * It works because MixColumns is a linear operation over GF(2^8) and - * AddRoundKey is an exclusive or, which is equivalent to addition over - * GF(2^8). 
(The inverse of MixColumns needs to be applied to the - * affected round keys separately which has been done when the - * decryption round keys were calculated.) */ - block = vaesimcq_u8(block); - keys += 16; - - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; rounds_12: - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; rounds_10: - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; - block = vaesdq_u8(block, vld1q_u8(keys)); - block = vaesimcq_u8(block); - keys += 16; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND_X2; + AESCE_DECRYPT_ROUND; /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the * last full round. */ diff --git a/library/aesce.h b/library/aesce.h index 61e73bfddd97..b12bf76ba446 100644 --- a/library/aesce.h +++ b/library/aesce.h @@ -52,8 +52,8 @@ int mbedtls_aesce_has_support(void); /** * \brief Internal AES-ECB block encryption and decryption * - * Note: this assumes that the context specifies either 10, 12 or 14 rounds - * and will behave incorrectly if this is not the case. + * \warning This assumes that the context specifies either 10, 12 or 14 + * rounds and will behave incorrectly if this is not the case. * * \param ctx AES context * \param mode MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT From 9bb7e6f4ce3aa65f6bdc07a4efd7c5856b181aa3 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 09:41:21 +0100 Subject: [PATCH 14/19] Rename MBEDTLS_OPTIMIZE_ALWAYS Signed-off-by: Dave Rodgman --- library/aes.c | 6 +++--- library/aesce.c | 6 +++--- library/ccm.c | 2 +- library/common.h | 9 ++++++--- library/gcm.c | 2 +- 5 files changed, 14 insertions(+), 11 deletions(-) diff --git a/library/aes.c b/library/aes.c index 977b3de2d888..b446265b255e 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,7 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { @@ -1147,10 +1147,10 @@ static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], /* * AES-XTS buffer encryption/decryption * - * Use of MBEDTLS_OPTIMIZE_ALWAYS here and for mbedtls_gf128mul_x_ble() + * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble() * is a 3x performance improvement for gcc -Os! 
*/ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, diff --git a/library/aesce.c b/library/aesce.c index 600326a08aab..ecfadcd9df68 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -109,7 +109,7 @@ int mbedtls_aesce_has_support(void) /* Two rounds of AESCE encryption */ #define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -173,7 +173,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, /* Two rounds of AESCE decryption */ #define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -210,7 +210,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], diff --git a/library/ccm.c b/library/ccm.c index 1e644dc17245..81bdfe7cc4cd 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,7 +326,7 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, diff --git a/library/common.h b/library/common.h index c477e1d373aa..497886f4d5bb 100644 --- a/library/common.h +++ b/library/common.h @@ -240,11 +240,14 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, #define MBEDTLS_COMPILER_IS_GCC #endif -/* If -Os is specified, override with -O2 for a given function */ +/* For gcc -Os, override with -O2 for a given function. + * + * This will not affect behaviour for other optimisation settings, e.g. -O0. 
+ */ #if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__) -#define MBEDTLS_OPTIMIZE_ALWAYS __attribute__((optimize("-O2"))) +#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2"))) #else -#define MBEDTLS_OPTIMIZE_ALWAYS +#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE #endif #endif /* MBEDTLS_LIBRARY_COMMON_H */ diff --git a/library/gcm.c b/library/gcm.c index 02f8cbfcaad3..6d7ef21c41d7 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,7 +417,7 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_ALWAYS +MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From bd1add94c006be18b9961580b0325269f6120195 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 13:50:14 +0100 Subject: [PATCH 15/19] Respect -Os for everything except XTS Signed-off-by: Dave Rodgman --- library/aesce.c | 2 -- library/ccm.c | 1 - library/gcm.c | 1 - 3 files changed, 4 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index ecfadcd9df68..1f3c83b8f1b4 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -173,7 +173,6 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, /* Two rounds of AESCE decryption */ #define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) @@ -210,7 +209,6 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, /* * AES-ECB block en(de)cryption */ -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_aesce_crypt_ecb(mbedtls_aes_context *ctx, int mode, const unsigned char input[16], diff --git a/library/ccm.c b/library/ccm.c index 81bdfe7cc4cd..36c999e7d73c 100644 --- a/library/ccm.c +++ b/library/ccm.c @@ -326,7 +326,6 @@ int mbedtls_ccm_update_ad(mbedtls_ccm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_ccm_update(mbedtls_ccm_context *ctx, const unsigned char *input, size_t input_len, unsigned char *output, size_t output_size, diff --git a/library/gcm.c b/library/gcm.c index 6d7ef21c41d7..35823e3d714d 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -417,7 +417,6 @@ static int gcm_mask(mbedtls_gcm_context *ctx, return 0; } -MBEDTLS_OPTIMIZE_FOR_PERFORMANCE int mbedtls_gcm_update(mbedtls_gcm_context *ctx, const unsigned char *input, size_t input_length, unsigned char *output, size_t output_size, From 73b0c0b051fee1a406593ab1ededbcb605ce9a5d Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 14:48:14 +0100 Subject: [PATCH 16/19] Improve comment Signed-off-by: Dave Rodgman --- library/aesce.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/library/aesce.c b/library/aesce.c index 1f3c83b8f1b4..c3aae85e8156 100644 --- a/library/aesce.c +++ b/library/aesce.c @@ -114,8 +114,7 @@ static uint8x16_t aesce_encrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds. - * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ + /* 10, 12 or 14 rounds. Unroll loop. */ if (rounds == 10) { goto rounds_10; } @@ -177,8 +176,7 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block, unsigned char *keys, int rounds) { - /* Assume either 10, 12 or 14 rounds. - * Skip 4 or 2 rounds, if doing 10 or 12 rounds */ + /* 10, 12 or 14 rounds. Unroll loop. 
*/ if (rounds == 10) { goto rounds_10; } From b2814bd089077b7ec195449a5f7fef316b7b3065 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 14:50:33 +0100 Subject: [PATCH 17/19] Only enable gcc -Os fix if we have AES hw support Signed-off-by: Dave Rodgman --- library/aes.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/library/aes.c b/library/aes.c index b446265b255e..ce458b6f9885 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1148,9 +1148,11 @@ static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], * AES-XTS buffer encryption/decryption * * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble() - * is a 3x performance improvement for gcc -Os! + * is a 3x performance improvement for gcc -Os, if we have hardware AES support. */ +#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C) MBEDTLS_OPTIMIZE_FOR_PERFORMANCE +#endif int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx, int mode, size_t length, From 4ad81ccdae220ede9b6aee7ef6ba2c2cebbccaf9 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 15:04:04 +0100 Subject: [PATCH 18/19] Only force O2 when hw acceleration available Signed-off-by: Dave Rodgman --- library/aes.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/library/aes.c b/library/aes.c index ce458b6f9885..49c308958836 100644 --- a/library/aes.c +++ b/library/aes.c @@ -1128,7 +1128,9 @@ typedef unsigned char mbedtls_be128[16]; * for machine endianness and hence works correctly on both big and little * endian machines. */ +#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C) MBEDTLS_OPTIMIZE_FOR_PERFORMANCE +#endif static inline void mbedtls_gf128mul_x_ble(unsigned char r[16], const unsigned char x[16]) { From 418843ed6426136642a3433cd34d2883234ff166 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 16 Jun 2023 15:27:23 +0100 Subject: [PATCH 19/19] Improve changelog Signed-off-by: Dave Rodgman --- ChangeLog.d/aes-perf.txt | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/ChangeLog.d/aes-perf.txt b/ChangeLog.d/aes-perf.txt index 696945739b44..ab716bce8c7e 100644 --- a/ChangeLog.d/aes-perf.txt +++ b/ChangeLog.d/aes-perf.txt @@ -1,4 +1,7 @@ Features * AES performance improvements. Uplift varies by platform, - toolchain, optimisation flags and mode, up to 4.5x. + toolchain, optimisation flags and mode. Aarch64, gcc -Os and CCM, GCM and XTS benefit the most. + On Aarch64, uplift is typically around 20 - 110%. + When compiling with gcc -Os on Aarch64, AES-XTS improves + by 4.5x.
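
The sketch below shows what the unrolled encryption path amounts to for the common 10-round (AES-128) case, using the AESCE_ENCRYPT_ROUND macros from patch 13 but without the goto dispatch. It is an illustration only: the helper's name is made up and does not appear in the series, and it has to be built with the Armv8-A crypto extensions enabled, as aesce.c already requires. The 12- and 14-round cases simply run one or two extra pairs of rounds before reaching this sequence.

#include <arm_neon.h>

/* Round macros as introduced in "Improve readability of unrolled AESCE code". */
#define AESCE_ENCRYPT_ROUND                   \
    block = vaeseq_u8(block, vld1q_u8(keys)); \
    block = vaesmcq_u8(block);                \
    keys += 16
#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND

/* Illustrative helper (not part of the patches): the rounds_10 path only. */
static uint8x16_t aesce_encrypt_block_aes128(uint8x16_t block,
                                             unsigned char *keys)
{
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 1-2 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 3-4 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 5-6 */
    AESCE_ENCRYPT_ROUND_X2;  /* rounds 7-8 */
    AESCE_ENCRYPT_ROUND;     /* round 9 */
    /* AddRoundKey for the previous round, SubBytes and ShiftRows for the
     * final round (no MixColumns), then the final AddRoundKey. */
    block = vaeseq_u8(block, vld1q_u8(keys));
    keys += 16;
    block = veorq_u8(block, vld1q_u8(keys));
    return block;
}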
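
The reordering argument in the decryption comments relies on the inverse MixColumns being linear over GF(2^8): InvMixColumns(a ^ b) == InvMixColumns(a) ^ InvMixColumns(b), which is why InvMixColumns can be applied to the middle decryption round keys in advance, when the key schedule is built. A minimal self-contained check of that identity, using arbitrary test bytes (again an illustration only; build for AArch64 with the crypto extensions):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    uint8_t ab[16], bb[16], l[16], r[16];
    for (int i = 0; i < 16; i++) {
        ab[i] = (uint8_t) (i * 17 + 3);      /* arbitrary test pattern */
        bb[i] = (uint8_t) (i * 29 + 101);
    }
    uint8x16_t a = vld1q_u8(ab);
    uint8x16_t b = vld1q_u8(bb);

    /* InvMixColumns(a ^ b) vs InvMixColumns(a) ^ InvMixColumns(b) */
    vst1q_u8(l, vaesimcq_u8(veorq_u8(a, b)));
    vst1q_u8(r, veorq_u8(vaesimcq_u8(a), vaesimcq_u8(b)));

    printf("identity %s\n", memcmp(l, r, 16) == 0 ? "holds" : "FAILS");
    return 0;
}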
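
The interaction of MBEDTLS_COMPILER_IS_GCC, __OPTIMIZE_SIZE__ and MBEDTLS_OPTIMIZE_FOR_PERFORMANCE can also be reproduced outside the library. The sketch below copies the two macro definitions from library/common.h and applies them to a hypothetical function that is not part of the patches: under gcc -Os the attribute bumps just that function to -O2, while under clang, armclang, ICC, or any build that is not -Os it expands to nothing. This is the behaviour the later patches rely on for mbedtls_aes_crypt_xts() and mbedtls_gf128mul_x_ble() when hardware AES support is compiled in.

#include <stddef.h>

/* Macro definitions copied from library/common.h (patches 03, 04 and 14). */
#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
#define MBEDTLS_COMPILER_IS_GCC
#endif

#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
#else
#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
#endif

/* Hypothetical hot loop: compiled at -O2 even in a gcc -Os build. */
MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
unsigned int sum_bytes(const unsigned char *p, size_t n)
{
    unsigned int acc = 0;
    for (size_t i = 0; i < n; i++) {
        acc += p[i];
    }
    return acc;
}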