Merge pull request #7784 from daverodgman/aesce-unroll

Dave Rodgman 2023-07-04 18:41:13 +01:00 committed by GitHub
commit c8d81ad54d
7 changed files with 156 additions and 58 deletions

View file

@@ -1,4 +1,7 @@
 Features
-   * AES performance improvements on 64-bit architectures. Uplift
-     varies by platform, toolchain, optimisation flags and mode,
-     in the 0 - 84% range. Aarch64, gcc and GCM/XTS benefit the most.
+   * AES performance improvements. Uplift varies by platform,
+     toolchain, optimisation flags and mode.
+     Aarch64, gcc -Os and CCM, GCM and XTS benefit the most.
+     On Aarch64, uplift is typically around 20 - 110%.
+     When compiling with gcc -Os on Aarch64, AES-XTS improves
+     by 4.5x.

View file

@@ -1077,23 +1077,6 @@ int mbedtls_aes_crypt_ecb(mbedtls_aes_context *ctx,
 #if defined(MBEDTLS_CIPHER_MODE_CBC)
 
-#if defined(__ARM_NEON) && defined(__aarch64__)
-/* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
- * the result for the next block in CBC, and the cost of transferring that data from
- * NEON registers, it is faster to use the following on aarch64.
- * For 32-bit arm, NEON should be faster. */
-#define CBC_XOR_16(r, a, b) do {                                           \
-        mbedtls_put_unaligned_uint64(r,                                    \
-                                     mbedtls_get_unaligned_uint64(a) ^     \
-                                     mbedtls_get_unaligned_uint64(b));     \
-        mbedtls_put_unaligned_uint64(r + 8,                                \
-                                     mbedtls_get_unaligned_uint64(a + 8) ^ \
-                                     mbedtls_get_unaligned_uint64(b + 8)); \
-} while (0)
-#else
-#define CBC_XOR_16(r, a, b) mbedtls_xor(r, a, b, 16)
-#endif
-
 /*
  * AES-CBC buffer encryption/decryption
  */
@@ -1136,7 +1119,10 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
             if (ret != 0) {
                 goto exit;
             }
-            CBC_XOR_16(output, output, iv);
+            /* Avoid using the NEON implementation of mbedtls_xor. Because of the dependency on
+             * the result for the next block in CBC, and the cost of transferring that data from
+             * NEON registers, NEON is slower on aarch64. */
+            mbedtls_xor_no_simd(output, output, iv, 16);
 
             memcpy(iv, temp, 16);
@@ -1146,7 +1132,7 @@ int mbedtls_aes_crypt_cbc(mbedtls_aes_context *ctx,
         }
     } else {
         while (length > 0) {
-            CBC_XOR_16(output, input, ivp);
+            mbedtls_xor_no_simd(output, input, ivp, 16);
 
             ret = mbedtls_aes_crypt_ecb(ctx, mode, output, output);
             if (ret != 0) {
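Note on the change above: in CBC each block's XOR input is the previous block's ciphertext, so the 16-byte XOR sits on the serial critical path, and the round-trip through NEON registers costs more than it saves on aarch64 (as the comment in the new code says). As a rough illustration only (hypothetical helper name, standard C, using memcpy where the library uses its unaligned-access helpers), the scalar two-lane XOR that mbedtls_xor_no_simd performs for a 16-byte block amounts to:

#include <stdint.h>
#include <string.h>

/* Illustrative sketch only: XOR 16 bytes as two 64-bit lanes. The real code
 * uses mbedtls_get_unaligned_uint64()/mbedtls_put_unaligned_uint64() instead
 * of memcpy. */
static void xor_block_16(unsigned char *r,
                         const unsigned char *a,
                         const unsigned char *b)
{
    for (size_t off = 0; off < 16; off += 8) {
        uint64_t x, y;
        memcpy(&x, a + off, 8);
        memcpy(&y, b + off, 8);
        x ^= y;
        memcpy(r + off, &x, 8);
    }
}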
@@ -1179,7 +1165,10 @@ typedef unsigned char mbedtls_be128[16];
  * for machine endianness and hence works correctly on both big and little
  * endian machines.
  */
-static void mbedtls_gf128mul_x_ble(unsigned char r[16],
-                                   const unsigned char x[16])
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+static inline void mbedtls_gf128mul_x_ble(unsigned char r[16],
+                                          const unsigned char x[16])
 {
     uint64_t a, b, ra, rb;
@@ -1196,7 +1185,13 @@ static void mbedtls_gf128mul_x_ble(unsigned char r[16],
 /*
  * AES-XTS buffer encryption/decryption
+ *
+ * Use of MBEDTLS_OPTIMIZE_FOR_PERFORMANCE here and for mbedtls_gf128mul_x_ble()
+ * is a 3x performance improvement for gcc -Os, if we have hardware AES support.
  */
+#if defined(MBEDTLS_AESCE_C) || defined(MBEDTLS_AESNI_C)
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
 int mbedtls_aes_crypt_xts(mbedtls_aes_xts_context *ctx,
                           int mode,
                           size_t length,
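The MBEDTLS_OPTIMIZE_FOR_PERFORMANCE annotation used above is defined in the common.h hunk at the end of this diff: under gcc -Os it expands to a per-function optimize("-O2") attribute, otherwise to nothing. A minimal standalone sketch of the same pattern (macro and function names here are placeholders, not the library's):

#include <stddef.h>

/* Placeholder version of the per-function optimisation override. */
#if defined(__GNUC__) && !defined(__clang__) && defined(__OPTIMIZE_SIZE__)
#define HOT_FUNCTION __attribute__((optimize("-O2")))
#else
#define HOT_FUNCTION
#endif

/* A hot inner loop compiled at -O2 even when the file is built with -Os;
 * the loop body is a stand-in for real per-block work. */
HOT_FUNCTION
static void process_blocks(unsigned char *data, size_t n_bytes)
{
    for (size_t i = 0; i < n_bytes; i++) {
        data[i] ^= 0xA5;
    }
}

The gcc-only guard matters because clang also defines __GNUC__ but does not support the optimize attribute, which is why the diff introduces MBEDTLS_COMPILER_IS_GCC rather than testing __GNUC__ directly.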

View file

@@ -101,39 +101,56 @@ int mbedtls_aesce_has_support(void)
 #endif
 }
 
+/* Single round of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND                   \
+    block = vaeseq_u8(block, vld1q_u8(keys)); \
+    block = vaesmcq_u8(block);                \
+    keys += 16
+
+/* Two rounds of AESCE encryption */
+#define AESCE_ENCRYPT_ROUND_X2 AESCE_ENCRYPT_ROUND; AESCE_ENCRYPT_ROUND
+
+MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
 static uint8x16_t aesce_encrypt_block(uint8x16_t block,
                                       unsigned char *keys,
                                       int rounds)
 {
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows (in this order).
-         * AddRoundKey adds the round key for the previous round. */
-        block = vaeseq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES mix columns */
-        block = vaesmcq_u8(block);
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
     }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_12:
+    AESCE_ENCRYPT_ROUND_X2;
+rounds_10:
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND_X2;
+    AESCE_ENCRYPT_ROUND;
 
     /* AES AddRoundKey for the previous round.
      * SubBytes, ShiftRows for the final round. */
-    block = vaeseq_u8(block, vld1q_u8(keys + (rounds -1) * 16));
+    block = vaeseq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Final round: no MixColumns */
 
     /* Final AddRoundKey */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
 
-static uint8x16_t aesce_decrypt_block(uint8x16_t block,
-                                      unsigned char *keys,
-                                      int rounds)
-{
-
-    for (int i = 0; i < rounds - 1; i++) {
-        /* AES AddRoundKey, SubBytes, ShiftRows */
-        block = vaesdq_u8(block, vld1q_u8(keys + i * 16));
-        /* AES inverse MixColumns for the next round.
+/* Single round of AESCE decryption
+ *
+ * AES AddRoundKey, SubBytes, ShiftRows
+ *
+ *      block = vaesdq_u8(block, vld1q_u8(keys));
+ *
+ * AES inverse MixColumns for the next round.
  *
  * This means that we switch the order of the inverse AddRoundKey and
  * inverse MixColumns operations. We have to do this as AddRoundKey is
@@ -144,16 +161,45 @@ static uint8x16_t aesce_decrypt_block(uint8x16_t block,
  * AddRoundKey is an exclusive or, which is equivalent to addition over
  * GF(2^8). (The inverse of MixColumns needs to be applied to the
  * affected round keys separately which has been done when the
-         * decryption round keys were calculated.) */
-        block = vaesimcq_u8(block);
-    }
+ * decryption round keys were calculated.)
+ *
+ *      block = vaesimcq_u8(block);
+ */
+
+#define AESCE_DECRYPT_ROUND                   \
+    block = vaesdq_u8(block, vld1q_u8(keys)); \
+    block = vaesimcq_u8(block);               \
+    keys += 16
+
+/* Two rounds of AESCE decryption */
+#define AESCE_DECRYPT_ROUND_X2 AESCE_DECRYPT_ROUND; AESCE_DECRYPT_ROUND
+
+static uint8x16_t aesce_decrypt_block(uint8x16_t block,
+                                      unsigned char *keys,
+                                      int rounds)
+{
+    /* 10, 12 or 14 rounds. Unroll loop. */
+    if (rounds == 10) {
+        goto rounds_10;
+    }
+    if (rounds == 12) {
+        goto rounds_12;
+    }
+    AESCE_DECRYPT_ROUND_X2;
+rounds_12:
+    AESCE_DECRYPT_ROUND_X2;
+rounds_10:
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND_X2;
+    AESCE_DECRYPT_ROUND;
 
     /* The inverses of AES AddRoundKey, SubBytes, ShiftRows finishing up the
      * last full round. */
-    block = vaesdq_u8(block, vld1q_u8(keys + (rounds - 1) * 16));
+    block = vaesdq_u8(block, vld1q_u8(keys));
+    keys += 16;
 
     /* Inverse AddRoundKey for inverting the initial round key addition. */
-    block = veorq_u8(block, vld1q_u8(keys + rounds * 16));
+    block = veorq_u8(block, vld1q_u8(keys));
 
     return block;
 }
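All four unrolled paths above share one dispatch trick: the 13 full rounds needed for AES-256 are laid out straight-line, and the 10- and 12-round cases jump over the first one or two double-rounds, so each key size executes exactly rounds - 1 full rounds before the final-round handling. A small self-contained sketch of that control flow, with a counter standing in for the AES round so it can be compiled and checked anywhere:

/* Returns the number of full rounds executed: 9, 11 or 13 for
 * rounds = 10, 12 or 14, mirroring the unrolled AESCE code above. */
static int unrolled_round_count(int rounds)
{
    int done = 0;
#define ROUND    done++
#define ROUND_X2 ROUND; ROUND
    if (rounds == 10) {
        goto rounds_10;
    }
    if (rounds == 12) {
        goto rounds_12;
    }
    ROUND_X2;      /* executed only for 14 rounds (AES-256) */
rounds_12:
    ROUND_X2;      /* executed for 12 and 14 rounds */
rounds_10:
    ROUND_X2;
    ROUND_X2;
    ROUND_X2;
    ROUND_X2;
    ROUND;
#undef ROUND
#undef ROUND_X2
    return done;
}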

View file

@@ -52,6 +52,9 @@ int mbedtls_aesce_has_support(void);
 /**
  * \brief       Internal AES-ECB block encryption and decryption
  *
+ * \warning     This assumes that the context specifies either 10, 12 or 14
+ *              rounds and will behave incorrectly if this is not the case.
+ *
  * \param ctx   AES context
  * \param mode  MBEDTLS_AES_ENCRYPT or MBEDTLS_AES_DECRYPT
  * \param input 16-byte input block
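For context (not part of the change): the 10, 12 and 14 round counts in the warning are the standard AES values for 128-, 192- and 256-bit keys respectively, which is what a context initialised through the library's normal key-setup paths will contain. A trivial sketch of that relationship:

/* Standard AES round counts (FIPS 197): Nr = Nk + 6, with Nk the key length
 * in 32-bit words. */
static int aes_rounds_for_keybits(unsigned int keybits)
{
    switch (keybits) {
        case 128: return 10;
        case 192: return 12;
        case 256: return 14;
        default:  return -1;   /* not a valid AES key size */
    }
}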

View file

@@ -673,14 +673,10 @@
 #if defined(__arm__)
 
 #if defined(__thumb__) && !defined(__thumb2__)
-#if !defined(__ARMCC_VERSION) && !defined(__clang__) \
-    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+#if defined(MBEDTLS_COMPILER_IS_GCC)
 /*
  * Thumb 1 ISA. This code path has only been tested successfully on gcc;
  * it does not compile on clang or armclang.
- *
- * Other compilers which define __GNUC__ may not work. The above macro
- * attempts to exclude these untested compilers.
  */
 
 #if !defined(__OPTIMIZE__) && defined(__GNUC__)

View file

@@ -237,7 +237,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
                input,
                block_size - cmac_ctx->unprocessed_len);
 
-        mbedtls_xor(state, cmac_ctx->unprocessed_block, state, block_size);
+        mbedtls_xor_no_simd(state, cmac_ctx->unprocessed_block, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {
@@ -255,7 +255,7 @@ int mbedtls_cipher_cmac_update(mbedtls_cipher_context_t *ctx,
     /* Iterate across the input data in block sized chunks, excluding any
      * final partial or complete block */
     for (j = 1; j < n; j++) {
-        mbedtls_xor(state, input, state, block_size);
+        mbedtls_xor_no_simd(state, input, state, block_size);
 
         if ((ret = mbedtls_cipher_update(ctx, state, block_size, state,
                                          &olen)) != 0) {

View file

@@ -192,6 +192,45 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
     }
 }
 
+/**
+ * Perform a fast block XOR operation, such that
+ * r[i] = a[i] ^ b[i] where 0 <= i < n
+ *
+ * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5%
+ * better in AES-CBC).
+ *
+ * \param   r Pointer to result (buffer of at least \p n bytes). \p r
+ *            may be equal to either \p a or \p b, but behaviour when
+ *            it overlaps in other ways is undefined.
+ * \param   a Pointer to input (buffer of at least \p n bytes)
+ * \param   b Pointer to input (buffer of at least \p n bytes)
+ * \param   n Number of bytes to process.
+ */
+static inline void mbedtls_xor_no_simd(unsigned char *r,
+                                       const unsigned char *a,
+                                       const unsigned char *b,
+                                       size_t n)
+{
+    size_t i = 0;
+#if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS)
+#if defined(__amd64__) || defined(__x86_64__) || defined(__aarch64__)
+    /* This codepath probably only makes sense on architectures with 64-bit registers */
+    for (; (i + 8) <= n; i += 8) {
+        uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i);
+        mbedtls_put_unaligned_uint64(r + i, x);
+    }
+#else
+    for (; (i + 4) <= n; i += 4) {
+        uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i);
+        mbedtls_put_unaligned_uint32(r + i, x);
+    }
+#endif
+#endif
+    for (; i < n; i++) {
+        r[i] = a[i] ^ b[i];
+    }
+}
+
 /* Fix MSVC C99 compatible issue
  * MSVC support __func__ from visual studio 2015( 1900 )
  * Use MSVC predefine macro to avoid name check fail.
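A possible usage sketch for the helper added above (caller and buffer names are hypothetical): because the function is static inline in a header, the compiler can still specialise it for a constant length such as 16, and the documentation explicitly permits r to alias a or b.

/* Hypothetical caller: XOR a 16-byte keystream block into a buffer in place,
 * using the scalar helper rather than the (possibly NEON-backed) mbedtls_xor(). */
static void apply_keystream_block(unsigned char buf[16],
                                  const unsigned char keystream[16])
{
    mbedtls_xor_no_simd(buf, buf, keystream, 16);
}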
@@ -261,4 +300,20 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned
 #define MBEDTLS_UNLIKELY(x) x
 #endif
 
+#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \
+    && !defined(__llvm__) && !defined(__INTEL_COMPILER)
+/* Defined if the compiler really is gcc and not clang, etc */
+#define MBEDTLS_COMPILER_IS_GCC
+#endif
+
+/* For gcc -Os, override with -O2 for a given function.
+ *
+ * This will not affect behaviour for other optimisation settings, e.g. -O0.
+ */
+#if defined(MBEDTLS_COMPILER_IS_GCC) && defined(__OPTIMIZE_SIZE__)
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE __attribute__((optimize("-O2")))
+#else
+#define MBEDTLS_OPTIMIZE_FOR_PERFORMANCE
+#endif
+
 #endif /* MBEDTLS_LIBRARY_COMMON_H */