Merge pull request #7784 from daverodgman/aesce-unroll

Dave Rodgman 2023-07-04 18:41:13 +01:00 committed by GitHub
commit c8d81ad54d
7 changed files with 156 additions and 58 deletions

View file

@@ -1,4 +1,7 @@
 Features
-   * AES performance improvements on 64-bit architectures. Uplift
-     varies by platform, toolchain, optimisation flags and mode,
-     in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
+   * AES performance improvements. Uplift varies by platform,
+     toolchain, optimisation flags and mode.
+     Aarch64, gcc -Os and CCM, GCM and XTS benefit the most.
+     On Aarch64, uplift is typically around 20 - 110%.
+     When compiling with gcc -Os on Aarch64, AES-XTS improves
+     by 4.5x.

View file

@@ -1077,23 +1077,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
-/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
- * the result for the next block in CBC, and the cost of transferring that data from
- * NEON registers, it is faster to use the following on aarch64.
- * For 32-bit arm, NEON should be faster. */
-#define CBC_XOR_16(r, a, b) do {                                           \
-        mbedtls_put_unaligned_uint64(r,                                    \
-                                     mbedtls_get_unaligned_uint64(a) ^     \
-                                     mbedtls_get_unaligned_uint64(b));     \
-        mbedtls_put_unaligned_uint64(r + 8,                                \
-                                     mbedtls_get_unaligned_uint64(a + 8) ^ \
-                                     mbedtls_get_unaligned_uint64(b + 8)); \
-} while (0)
-#else
-#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
-#endif
-
 /*
  * AES-CBC buffer encryption/decryption
  */
@@ -1136,7 +1119,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             if (ret != 0) {
                 goto exit;
             }
-            CBC_XOR_16(output, output, iv);
+            /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
+             * the result for the next block in CBC, and the cost of transferring that data from
+             * NEON registers, NEON is slower on aarch64. */
+            mbedtls_xor_no_simd(output, output, iv, 16);
 
             memcpy(iv, temp, 16);
@@ -1146,7 +1132,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
         }
     } else {
         while (length > 0) {
-            CBC_XOR_16(output, input, ivp);
+            mbedtls_xor_no_simd(output, input, ivp, 16);
 
             ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
             if (ret != 0) {
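Note on the change above: in CBC each block's XOR input is the previous block's ciphertext, so the 16-byte XOR sits on the serial critical path, and the round-trip through NEON registers costs more than it saves on aarch64 (as the comment in the new code says). As a rough illustration only (hypothetical helper name, standard C, using memcpy where the library uses its unaligned-access helpers), the scalar two-lane XOR that mbedtls_xor_no_simd performs for a 16-byte block amounts to:

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: XOR 16 bytes as two 64-bit lanes. The real code
 * uses mbedtls_get_unaligned_uint64()/mbedtls_put_unaligned_uint64() instead
 * of memcpy. */
static void xor_block_16(unsigned char *r,
                         const unsigned char *a,
                         const unsigned char *b)
{
    for (size_t off = 0; off < 16; off += 8) {
        uint64_t x, y;
        memcpy(&x, a + off, 8);
        memcpy(&y, b + off, 8);
        x ^= y;
        memcpy(r + off, &x, 8);
    }
}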
@@ -1179,7 +1165,10 @@ typedef unsigned char mbedtls_be128[16];
  * for machine endianness and hence works correctly on both big and little
  * endian machines.
  */
-static void mbedtls_gf128mul_x_ble(unsigned char r[16],
-                                   const unsigned char x[16])
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+static inline void mbedtls_gf128mul_x_ble(unsigned char r[16],
+                                          const unsigned char x[16])
 {
     uint64_t a, b, ra, rb;
@@ -1196,7 +1185,13 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16],
 /*
  * AES-XTS buffer encryption/decryption
+ *
+ * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble()
+ * is a 3x performance improvement for gcc -Os, if we have hardware AES support.
  */
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
 int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
                           int mode,
                           size_t length,
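The MBEDTLS_OPTIMIZE_FOR_PERFORMANCE annotation used above is defined in the common.h hunk at the end of this diff: under gcc -Os it expands to a per-function optimize("-O2") attribute, otherwise to nothing. A minimal standalone sketch of the same pattern (macro and function names here are placeholders, not the library's):

#include <stddef.h>

/* Placeholder version of the per-function optimisation override. */
#if defined(__GNUC__) && !defined(__clang__) && defined(__OPTIMIZE_SIZE__)
#define HOT_FUNCTION __attribute__((optimize("-O2")))
#else
#define HOT_FUNCTION
#endif

/* A hot inner loop compiled at -O2 even when the file is built with -Os;
 * the loop body is a stand-in for real per-block work. */
HOT_FUNCTION
static void process_blocks(unsigned char *data, size_t n_bytes)
{
    for (size_t i = 0; i < n_bytes; i++) {
        data[i] ^= 0xA5;
    }
}

The gcc-only guard matters because clang also defines __GNUC__ but does not support the optimize attribute, which is why the diff introduces MBEDTLS_COMPILER_IS_GCC rather than testing __GNUC__ directly.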

View file

@@ -101,39 +101,56 @@ int mbedtls_aesce_has_support(void)
 #endif
 }
 
+/* Single round of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND                   \
+    block = vaeseq_u8(block, vld1q_u8(keys)); \
+    block = vaesmcq_u8(block);                \
+    keys += 16
+
+/* Two rounds of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
+
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
-         * AddRoundKey adds the round key for the previous round. */
-        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES mix columns */
-        block = vaesmcq_u8(block);
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
     }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_12:
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_10:
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND;
 
     /* AES AddRoundKey for the previous round.
      * SubBytes, ShiftRows for the final round. */
-    block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16));
+    block = vaeseq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Final round: no MixColumns */
 
     /* Final AddRoundKey */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
 
-static uint8x16_t aesce_decrypt_block(uint8x16_t block,
-                                      unsigned char *keys,
-                                      int rounds)
-{
-
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows */
-        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES inverse MixColumns for the next round.
+/* Single round of AESCE decryption
+ *
+ * AES AddRoundKey, SubBytes, ShiftRows
+ *
+ *      block = vaesdq_u8(block, vld1q_u8(keys));
+ *
+ * AES inverse MixColumns for the next round.
  *
  * This means that we switch the order of the inverse AddRoundKey and
  * inverse MixColumns operations. We have to do this as AddRoundKey is
@@ -144,16 +161,45 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block,
  * AddRoundKey is an exclusive or, which is equivalent to addition over
  * GF(2^8). (The inverse of MixColumns needs to be applied to the
  * affected round keys separately which has been done when the
-         * decryption round keys were calculated.) */
-        block = vaesimcq_u8(block);
-    }
+ * decryption round keys were calculated.)
+ *
+ *      block = vaesimcq_u8(block);
+ */
+
+#define AESCE_DECRYPT_ROUND                   \
+    block = vaesdq_u8(block, vld1q_u8(keys)); \
+    block = vaesimcq_u8(block);               \
+    keys += 16
+
+/* Two rounds of AESCE decryption */
+#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
+
+static uint8x16_t aesce_decrypt_block(uint8x16_t block,
+                                      unsigned char *keys,
+                                      int rounds)
+{
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
+    }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_DECRYPT_ROUND_X2;
+rounds_12:
+    AESCE_DECRYPT_ROUND_X2;
+rounds_10:
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND;
 
     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
      * last full round. */
-    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));
+    block = vaesdq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Inverse AddRoundKey for inverting the initial round key addition. */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
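All four unrolled paths above share one dispatch trick: the 13 full rounds needed for AES-256 are laid out straight-line, and the 10- and 12-round cases jump over the first one or two double-rounds, so each key size executes exactly rounds - 1 full rounds before the final-round handling. A small self-contained sketch of that control flow, with a counter standing in for the AES round so it can be compiled and checked anywhere:

/* Returns the number of full rounds executed: 9, 11 or 13 for
 * rounds = 10, 12 or 14, mirroring the unrolled AESCE code above. */
static int unrolled_round_count(int rounds)
{
    int done = 0;
#define ROUND    done++
#define ROUND_X2 ROUND; ROUND
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    ROUND_X2;      /* executed only for 14 rounds (AES-256) */
rounds_12:
    ROUND_X2;      /* executed for 12 and 14 rounds */
rounds_10:
    ROUND_X2;
    ROUND_X2;
    ROUND_X2;
    ROUND_X2;
    ROUND;
#undef ROUND
#undef ROUND_X2
    return done;
}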

View file

@@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void);
 /**
  * \brief       Internal AES-ECB block encryption and decryption
  *
+ * \warning     This assumes that the context specifies either 10, 12 or 14
+ *              rounds and will behave incorrectly if this is not the case.
+ *
  * \param ctx   AES context
  * \param mode  MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
  * \param input 16-byte input block
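For context (not part of the change): the 10, 12 and 14 round counts in the warning are the standard AES values for 128-, 192- and 256-bit keys respectively, which is what a context initialised through the library's normal key-setup paths will contain. A trivial sketch of that relationship:

/* Standard AES round counts (FIPS 197): Nr = Nk + 6, with Nk the key length
 * in 32-bit words. */
static int aes_rounds_for_keybits(unsigned int keybits)
{
    switch (keybits) {
        case 128: return 10;
        case 192: return 12;
        case 256: return 14;
        default:  return -1;   /* not a valid AES key size */
    }
}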

View file

@@ -673,14 +673,10 @@
 #if defined(__arm__)
 
 #if defined(__thumb__) && !defined(__thumb2__)
-#if !defined(__ARMCC_VERSION) && !defined(__clang__) \
-    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+#if defined(MBEDTLS_COMPILER_IS_GCC)
 /*
  * Thumb 1 ISA. This code path has only been tested successfully on gcc;
  * it does not compile on clang or armclang.
- *
- * Other compilers which define __GNUC__ may not work. The above macro
- * attempts to exclude these untested compilers.
  */
 
 #if !defined(__OPTIMIZE__) && defined(__GNUC__)

View file

@@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
                input,
                block_size - cmac_ctx->unprocessed_len);
 
-        mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size);
+        mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
@@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
     /* Iterate across the input data in block sized chunks, excluding any
      * final partial or complete block */
     for (j = 1; j < n; j++) {
-        mbedtls_xor(state, input, state, block_size);
+        mbedtls_xor_no_simd(state, input, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {

View file

@@ -192,6 +192,45 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
     }
 }
 
+/**
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
+ *
+ * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
+ * better in AES-CBC).
+ *
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
+ */
+static inline void mbedtls_xor_no_simd(unsigned char *r,
+                                       const unsigned char *a,
+                                       const unsigned char *b,
+                                       size_t n)
+{
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+    /* This codepath probably only makes sense on architectures with 64-bit registers */
+    for (; (i + 8) <= n; i += 8) {
+        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+        mbedtls_put_unaligned_uint64(r + i, x);
+    }
+#else
+    for (; (i + 4) <= n; i += 4) {
+        uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
+        mbedtls_put_unaligned_uint32(r + i, x);
+    }
+#endif
+#endif
+    for (; i < n; i++) {
+        r[i] = a[i] ^ b[i];
+    }
+}
+
 /* Fix MSVC C99 compatible issue
  * MSVC support __func__ from visual studio 2015( 1900 )
  * Use MSVC predefine macro to avoid name check fail.
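A possible usage sketch for the helper added above (caller and buffer names are hypothetical): because the function is static inline in a header, the compiler can still specialise it for a constant length such as 16, and the documentation explicitly permits r to alias a or b.

/* Hypothetical caller: XOR a 16-byte keystream block into a buffer in place,
 * using the scalar helper rather than the (possibly NEON-backed) mbedtls_xor(). */
static void apply_keystream_block(unsigned char buf[16],
                                  const unsigned char keystream[16])
{
    mbedtls_xor_no_simd(buf, buf, keystream, 16);
}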
@@ -261,4 +300,20 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_UNLIKELY(x) x
 #endif
 
+#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
+    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+/* Defined if the compiler really is gcc and not clang, etc */
+#define MBEDTLS_COMPILER_IS_GCC
+#endif
+
+/* For gcc -Os, override with -O2 for a given function.
+ *
+ * This will not affect behaviour for other optimisation settings, e.g. -O0.
+ */
+#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
+#else
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */