diff --git a/ChangeLog.d/gcm-large-tables.txt b/ChangeLog.d/gcm-large-tables.txt new file mode 100644 index 000000000..f9bba5b4e --- /dev/null +++ b/ChangeLog.d/gcm-large-tables.txt @@ -0,0 +1,4 @@ +Features + * Add support for 8-bit GCM tables for Shoup's algorithm to speedup GCM + operations when hardware accelerated AES is not present. Improves + performance by around 30% on 64-bit Intel; 125% on Armv7-M. diff --git a/include/mbedtls/gcm.h b/include/mbedtls/gcm.h index 631b392fd..98faa4361 100644 --- a/include/mbedtls/gcm.h +++ b/include/mbedtls/gcm.h @@ -46,6 +46,12 @@ extern "C" { #if !defined(MBEDTLS_GCM_ALT) +#if defined(MBEDTLS_GCM_LARGE_TABLE) +#define MBEDTLS_GCM_HTABLE_SIZE 256 +#else +#define MBEDTLS_GCM_HTABLE_SIZE 16 +#endif + /** * \brief The GCM context structure. */ @@ -53,18 +59,18 @@ typedef struct mbedtls_gcm_context { #if defined(MBEDTLS_BLOCK_CIPHER_C) mbedtls_block_cipher_context_t MBEDTLS_PRIVATE(block_cipher_ctx); /*!< The cipher context used. */ #else - mbedtls_cipher_context_t MBEDTLS_PRIVATE(cipher_ctx); /*!< The cipher context used. */ + mbedtls_cipher_context_t MBEDTLS_PRIVATE(cipher_ctx); /*!< The cipher context used. */ #endif - uint64_t MBEDTLS_PRIVATE(HL)[16]; /*!< Precalculated HTable low. */ - uint64_t MBEDTLS_PRIVATE(HH)[16]; /*!< Precalculated HTable high. */ - uint64_t MBEDTLS_PRIVATE(len); /*!< The total length of the encrypted data. */ - uint64_t MBEDTLS_PRIVATE(add_len); /*!< The total length of the additional data. */ - unsigned char MBEDTLS_PRIVATE(base_ectr)[16]; /*!< The first ECTR for tag. */ - unsigned char MBEDTLS_PRIVATE(y)[16]; /*!< The Y working value. */ - unsigned char MBEDTLS_PRIVATE(buf)[16]; /*!< The buf working value. */ - int MBEDTLS_PRIVATE(mode); /*!< The operation to perform: - #MBEDTLS_GCM_ENCRYPT or - #MBEDTLS_GCM_DECRYPT. */ + uint64_t MBEDTLS_PRIVATE(H)[MBEDTLS_GCM_HTABLE_SIZE][2]; /*!< Precalculated HTable. */ + uint64_t MBEDTLS_PRIVATE(len); /*!< The total length of the encrypted data. */ + uint64_t MBEDTLS_PRIVATE(add_len); /*!< The total length of the additional data. */ + unsigned char MBEDTLS_PRIVATE(base_ectr)[16]; /*!< The first ECTR for tag. */ + unsigned char MBEDTLS_PRIVATE(y)[16]; /*!< The Y working value. */ + unsigned char MBEDTLS_PRIVATE(buf)[16]; /*!< The buf working value. */ + unsigned char MBEDTLS_PRIVATE(mode); /*!< The operation to perform: + #MBEDTLS_GCM_ENCRYPT or + #MBEDTLS_GCM_DECRYPT. */ + unsigned char MBEDTLS_PRIVATE(acceleration); /*!< The acceleration to use. */ } mbedtls_gcm_context; diff --git a/include/mbedtls/mbedtls_config.h b/include/mbedtls/mbedtls_config.h index 254e75a1a..7cf4153b1 100644 --- a/include/mbedtls/mbedtls_config.h +++ b/include/mbedtls/mbedtls_config.h @@ -2800,6 +2800,22 @@ */ #define MBEDTLS_GCM_C +/** + * \def MBEDTLS_GCM_LARGE_TABLE + * + * Enable large pre-computed tables for Galois/Counter Mode (GCM). + * Can significantly increase throughput on systems without GCM hardware + * acceleration (e.g., AESNI, AESCE). + * + * The mbedtls_gcm_context size will increase by 3840 bytes. + * The code size will increase by roughly 344 bytes. + * + * Module: library/gcm.c + * + * Requires: MBEDTLS_GCM_C + */ +//#define MBEDTLS_GCM_LARGE_TABLE + /** * \def MBEDTLS_HKDF_C * diff --git a/library/gcm.c b/library/gcm.c index 033cb5901..90c1d1d4e 100644 --- a/library/gcm.c +++ b/library/gcm.c @@ -41,6 +41,12 @@ #if !defined(MBEDTLS_GCM_ALT) +/* Used to select the acceleration mechanism */ +#define MBEDTLS_GCM_ACC_SMALLTABLE 0 +#define MBEDTLS_GCM_ACC_LARGETABLE 1 +#define MBEDTLS_GCM_ACC_AESNI 2 +#define MBEDTLS_GCM_ACC_AESCE 3 + /* * Initialize a context */ @@ -49,6 +55,39 @@ void mbedtls_gcm_init(mbedtls_gcm_context *ctx) memset(ctx, 0, sizeof(mbedtls_gcm_context)); } +static inline void gcm_set_acceleration(mbedtls_gcm_context *ctx) +{ +#if defined(MBEDTLS_GCM_LARGE_TABLE) + ctx->acceleration = MBEDTLS_GCM_ACC_LARGETABLE; +#else + ctx->acceleration = MBEDTLS_GCM_ACC_SMALLTABLE; +#endif + +#if defined(MBEDTLS_AESNI_HAVE_CODE) + /* With CLMUL support, we need only h, not the rest of the table */ + if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) { + ctx->acceleration = MBEDTLS_GCM_ACC_AESNI; + } +#endif + +#if defined(MBEDTLS_AESCE_HAVE_CODE) + if (MBEDTLS_AESCE_HAS_SUPPORT()) { + ctx->acceleration = MBEDTLS_GCM_ACC_AESCE; + } +#endif +} + +static inline void gcm_gen_table_rightshift(uint64_t dst[2], const uint64_t src[2]) +{ + uint8_t *u8Dst = (uint8_t *) dst; + uint8_t *u8Src = (uint8_t *) src; + + MBEDTLS_PUT_UINT64_BE(MBEDTLS_GET_UINT64_BE(&src[1], 0) >> 1, &dst[1], 0); + u8Dst[8] |= (u8Src[7] & 0x01) << 7; + MBEDTLS_PUT_UINT64_BE(MBEDTLS_GET_UINT64_BE(&src[0], 0) >> 1, &dst[0], 0); + u8Dst[0] ^= (u8Src[15] & 0x01) ? 0xE1 : 0; +} + /* * Precompute small multiples of H, that is set * HH[i] || HL[i] = H times i, @@ -60,11 +99,8 @@ void mbedtls_gcm_init(mbedtls_gcm_context *ctx) static int gcm_gen_table(mbedtls_gcm_context *ctx) { int ret, i, j; - uint64_t hi, lo; - uint64_t vl, vh; - unsigned char h[16]; - - memset(h, 0, 16); + uint64_t u64h[2] = { 0 }; + uint8_t *h = (uint8_t *) u64h; #if defined(MBEDTLS_BLOCK_CIPHER_C) ret = mbedtls_block_cipher_encrypt(&ctx->block_cipher_ctx, h, h); @@ -76,53 +112,48 @@ static int gcm_gen_table(mbedtls_gcm_context *ctx) return ret; } - /* pack h as two 64-bits ints, big-endian */ - hi = MBEDTLS_GET_UINT32_BE(h, 0); - lo = MBEDTLS_GET_UINT32_BE(h, 4); - vh = (uint64_t) hi << 32 | lo; + gcm_set_acceleration(ctx); - hi = MBEDTLS_GET_UINT32_BE(h, 8); - lo = MBEDTLS_GET_UINT32_BE(h, 12); - vl = (uint64_t) hi << 32 | lo; - - /* 8 = 1000 corresponds to 1 in GF(2^128) */ - ctx->HL[8] = vl; - ctx->HH[8] = vh; + /* MBEDTLS_GCM_HTABLE_SIZE/2 = 1000 corresponds to 1 in GF(2^128) */ + ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2][0] = u64h[0]; + ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2][1] = u64h[1]; + switch (ctx->acceleration) { #if defined(MBEDTLS_AESNI_HAVE_CODE) - /* With CLMUL support, we need only h, not the rest of the table */ - if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) { - return 0; - } + case MBEDTLS_GCM_ACC_AESNI: + return 0; #endif #if defined(MBEDTLS_AESCE_HAVE_CODE) - if (MBEDTLS_AESCE_HAS_SUPPORT()) { - return 0; - } + case MBEDTLS_GCM_ACC_AESCE: + return 0; #endif - /* 0 corresponds to 0 in GF(2^128) */ - ctx->HH[0] = 0; - ctx->HL[0] = 0; + default: + /* 0 corresponds to 0 in GF(2^128) */ + ctx->H[0][0] = 0; + ctx->H[0][1] = 0; - for (i = 4; i > 0; i >>= 1) { - uint32_t T = (vl & 1) * 0xe1000000U; - vl = (vh << 63) | (vl >> 1); - vh = (vh >> 1) ^ ((uint64_t) T << 32); + for (i = MBEDTLS_GCM_HTABLE_SIZE/4; i > 0; i >>= 1) { + gcm_gen_table_rightshift(ctx->H[i], ctx->H[i*2]); + } - ctx->HL[i] = vl; - ctx->HH[i] = vh; - } +#if !defined(MBEDTLS_GCM_LARGE_TABLE) + /* pack elements of H as 64-bits ints, big-endian */ + for (i = MBEDTLS_GCM_HTABLE_SIZE/2; i > 0; i >>= 1) { + MBEDTLS_PUT_UINT64_BE(ctx->H[i][0], &ctx->H[i][0], 0); + MBEDTLS_PUT_UINT64_BE(ctx->H[i][1], &ctx->H[i][1], 0); + } +#endif - for (i = 2; i <= 8; i *= 2) { - uint64_t *HiL = ctx->HL + i, *HiH = ctx->HH + i; - vh = *HiH; - vl = *HiL; - for (j = 1; j < i; j++) { - HiH[j] = vh ^ ctx->HH[j]; - HiL[j] = vl ^ ctx->HL[j]; - } + for (i = 2; i < MBEDTLS_GCM_HTABLE_SIZE; i <<= 1) { + for (j = 1; j < i; j++) { + mbedtls_xor_no_simd((unsigned char *) ctx->H[i+j], + (unsigned char *) ctx->H[i], + (unsigned char *) ctx->H[j], + 16); + } + } } return 0; @@ -181,6 +212,80 @@ int mbedtls_gcm_setkey(mbedtls_gcm_context *ctx, return 0; } +#if defined(MBEDTLS_GCM_LARGE_TABLE) +static const uint16_t last8[256] = { + 0x0000, 0xc201, 0x8403, 0x4602, 0x0807, 0xca06, 0x8c04, 0x4e05, + 0x100e, 0xd20f, 0x940d, 0x560c, 0x1809, 0xda08, 0x9c0a, 0x5e0b, + 0x201c, 0xe21d, 0xa41f, 0x661e, 0x281b, 0xea1a, 0xac18, 0x6e19, + 0x3012, 0xf213, 0xb411, 0x7610, 0x3815, 0xfa14, 0xbc16, 0x7e17, + 0x4038, 0x8239, 0xc43b, 0x063a, 0x483f, 0x8a3e, 0xcc3c, 0x0e3d, + 0x5036, 0x9237, 0xd435, 0x1634, 0x5831, 0x9a30, 0xdc32, 0x1e33, + 0x6024, 0xa225, 0xe427, 0x2626, 0x6823, 0xaa22, 0xec20, 0x2e21, + 0x702a, 0xb22b, 0xf429, 0x3628, 0x782d, 0xba2c, 0xfc2e, 0x3e2f, + 0x8070, 0x4271, 0x0473, 0xc672, 0x8877, 0x4a76, 0x0c74, 0xce75, + 0x907e, 0x527f, 0x147d, 0xd67c, 0x9879, 0x5a78, 0x1c7a, 0xde7b, + 0xa06c, 0x626d, 0x246f, 0xe66e, 0xa86b, 0x6a6a, 0x2c68, 0xee69, + 0xb062, 0x7263, 0x3461, 0xf660, 0xb865, 0x7a64, 0x3c66, 0xfe67, + 0xc048, 0x0249, 0x444b, 0x864a, 0xc84f, 0x0a4e, 0x4c4c, 0x8e4d, + 0xd046, 0x1247, 0x5445, 0x9644, 0xd841, 0x1a40, 0x5c42, 0x9e43, + 0xe054, 0x2255, 0x6457, 0xa656, 0xe853, 0x2a52, 0x6c50, 0xae51, + 0xf05a, 0x325b, 0x7459, 0xb658, 0xf85d, 0x3a5c, 0x7c5e, 0xbe5f, + 0x00e1, 0xc2e0, 0x84e2, 0x46e3, 0x08e6, 0xcae7, 0x8ce5, 0x4ee4, + 0x10ef, 0xd2ee, 0x94ec, 0x56ed, 0x18e8, 0xdae9, 0x9ceb, 0x5eea, + 0x20fd, 0xe2fc, 0xa4fe, 0x66ff, 0x28fa, 0xeafb, 0xacf9, 0x6ef8, + 0x30f3, 0xf2f2, 0xb4f0, 0x76f1, 0x38f4, 0xfaf5, 0xbcf7, 0x7ef6, + 0x40d9, 0x82d8, 0xc4da, 0x06db, 0x48de, 0x8adf, 0xccdd, 0x0edc, + 0x50d7, 0x92d6, 0xd4d4, 0x16d5, 0x58d0, 0x9ad1, 0xdcd3, 0x1ed2, + 0x60c5, 0xa2c4, 0xe4c6, 0x26c7, 0x68c2, 0xaac3, 0xecc1, 0x2ec0, + 0x70cb, 0xb2ca, 0xf4c8, 0x36c9, 0x78cc, 0xbacd, 0xfccf, 0x3ece, + 0x8091, 0x4290, 0x0492, 0xc693, 0x8896, 0x4a97, 0x0c95, 0xce94, + 0x909f, 0x529e, 0x149c, 0xd69d, 0x9898, 0x5a99, 0x1c9b, 0xde9a, + 0xa08d, 0x628c, 0x248e, 0xe68f, 0xa88a, 0x6a8b, 0x2c89, 0xee88, + 0xb083, 0x7282, 0x3480, 0xf681, 0xb884, 0x7a85, 0x3c87, 0xfe86, + 0xc0a9, 0x02a8, 0x44aa, 0x86ab, 0xc8ae, 0x0aaf, 0x4cad, 0x8eac, + 0xd0a7, 0x12a6, 0x54a4, 0x96a5, 0xd8a0, 0x1aa1, 0x5ca3, 0x9ea2, + 0xe0b5, 0x22b4, 0x64b6, 0xa6b7, 0xe8b2, 0x2ab3, 0x6cb1, 0xaeb0, + 0xf0bb, 0x32ba, 0x74b8, 0xb6b9, 0xf8bc, 0x3abd, 0x7cbf, 0xbebe +}; + +static void gcm_mult_largetable(uint8_t *output, const uint8_t *x, uint64_t H[256][2]) +{ + int i; + uint64_t u64z[2]; + uint16_t *u16z = (uint16_t *) u64z; + uint8_t *u8z = (uint8_t *) u64z; + uint8_t rem; + + u64z[0] = 0; + u64z[1] = 0; + + if (MBEDTLS_IS_BIG_ENDIAN) { + for (i = 15; i > 0; i--) { + mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[x[i]], 16); + rem = u8z[15]; + + u64z[1] >>= 8; + u8z[8] = u8z[7]; + u64z[0] >>= 8; + + u16z[0] ^= MBEDTLS_GET_UINT16_LE(&last8[rem], 0); + } + } else { + for (i = 15; i > 0; i--) { + mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[x[i]], 16); + rem = u8z[15]; + + u64z[1] <<= 8; + u8z[8] = u8z[7]; + u64z[0] <<= 8; + + u16z[0] ^= last8[rem]; + } + } + + mbedtls_xor_no_simd(output, u8z, (uint8_t *) H[x[0]], 16); +} +#else /* * Shoup's method for multiplication use this table with * last4[x] = x times P^128 @@ -194,6 +299,47 @@ static const uint16_t last4[16] = 0x9180, 0x8da0, 0xa9c0, 0xb5e0 }; +static void gcm_mult_smalltable(uint8_t *output, const uint8_t *x, uint64_t H[16][2]) +{ + int i = 0; + unsigned char lo, hi, rem; + uint64_t u64z[2]; + const uint64_t *pu64z = NULL; + uint8_t *u8z = (uint8_t *) u64z; + + lo = x[15] & 0xf; + hi = (x[15] >> 4) & 0xf; + + pu64z = H[lo]; + + rem = (unsigned char) pu64z[1] & 0xf; + u64z[1] = (pu64z[0] << 60) | (pu64z[1] >> 4); + u64z[0] = (pu64z[0] >> 4); + u64z[0] ^= (uint64_t) last4[rem] << 48; + mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[hi], 16); + + for (i = 14; i >= 0; i--) { + lo = x[i] & 0xf; + hi = (x[i] >> 4) & 0xf; + + rem = (unsigned char) u64z[1] & 0xf; + u64z[1] = (u64z[0] << 60) | (u64z[1] >> 4); + u64z[0] = (u64z[0] >> 4); + u64z[0] ^= (uint64_t) last4[rem] << 48; + mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[lo], 16); + + rem = (unsigned char) u64z[1] & 0xf; + u64z[1] = (u64z[0] << 60) | (u64z[1] >> 4); + u64z[0] = (u64z[0] >> 4); + u64z[0] ^= (uint64_t) last4[rem] << 48; + mbedtls_xor_no_simd(u8z, u8z, (uint8_t *) H[hi], 16); + } + + MBEDTLS_PUT_UINT64_BE(u64z[0], output, 0); + MBEDTLS_PUT_UINT64_BE(u64z[1], output, 8); +} +#endif + /* * Sets output to x times H using the precomputed tables. * x and output are seen as elements of GF(2^128) as in [MGV]. @@ -201,71 +347,31 @@ static const uint16_t last4[16] = static void gcm_mult(mbedtls_gcm_context *ctx, const unsigned char x[16], unsigned char output[16]) { - int i = 0; - unsigned char lo, hi, rem; - uint64_t zh, zl; - + switch (ctx->acceleration) { #if defined(MBEDTLS_AESNI_HAVE_CODE) - if (mbedtls_aesni_has_support(MBEDTLS_AESNI_CLMUL)) { - unsigned char h[16]; - - /* mbedtls_aesni_gcm_mult needs big-endian input */ - MBEDTLS_PUT_UINT32_BE(ctx->HH[8] >> 32, h, 0); - MBEDTLS_PUT_UINT32_BE(ctx->HH[8], h, 4); - MBEDTLS_PUT_UINT32_BE(ctx->HL[8] >> 32, h, 8); - MBEDTLS_PUT_UINT32_BE(ctx->HL[8], h, 12); - - mbedtls_aesni_gcm_mult(output, x, h); - return; - } -#endif /* MBEDTLS_AESNI_HAVE_CODE */ - -#if defined(MBEDTLS_AESCE_HAVE_CODE) - if (MBEDTLS_AESCE_HAS_SUPPORT()) { - unsigned char h[16]; - - /* mbedtls_aesce_gcm_mult needs big-endian input */ - MBEDTLS_PUT_UINT32_BE(ctx->HH[8] >> 32, h, 0); - MBEDTLS_PUT_UINT32_BE(ctx->HH[8], h, 4); - MBEDTLS_PUT_UINT32_BE(ctx->HL[8] >> 32, h, 8); - MBEDTLS_PUT_UINT32_BE(ctx->HL[8], h, 12); - - mbedtls_aesce_gcm_mult(output, x, h); - return; - } + case MBEDTLS_GCM_ACC_AESNI: + mbedtls_aesni_gcm_mult(output, x, (uint8_t *) ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2]); + break; #endif - lo = x[15] & 0xf; +#if defined(MBEDTLS_AESCE_HAVE_CODE) + case MBEDTLS_GCM_ACC_AESCE: + mbedtls_aesce_gcm_mult(output, x, (uint8_t *) ctx->H[MBEDTLS_GCM_HTABLE_SIZE/2]); + break; +#endif - zh = ctx->HH[lo]; - zl = ctx->HL[lo]; - - for (i = 15; i >= 0; i--) { - lo = x[i] & 0xf; - hi = (x[i] >> 4) & 0xf; - - if (i != 15) { - rem = (unsigned char) zl & 0xf; - zl = (zh << 60) | (zl >> 4); - zh = (zh >> 4); - zh ^= (uint64_t) last4[rem] << 48; - zh ^= ctx->HH[lo]; - zl ^= ctx->HL[lo]; - - } - - rem = (unsigned char) zl & 0xf; - zl = (zh << 60) | (zl >> 4); - zh = (zh >> 4); - zh ^= (uint64_t) last4[rem] << 48; - zh ^= ctx->HH[hi]; - zl ^= ctx->HL[hi]; +#if defined(MBEDTLS_GCM_LARGE_TABLE) + case MBEDTLS_GCM_ACC_LARGETABLE: + gcm_mult_largetable(output, x, ctx->H); + break; +#else + case MBEDTLS_GCM_ACC_SMALLTABLE: + gcm_mult_smalltable(output, x, ctx->H); + break; +#endif } - MBEDTLS_PUT_UINT32_BE(zh >> 32, output, 0); - MBEDTLS_PUT_UINT32_BE(zh, output, 4); - MBEDTLS_PUT_UINT32_BE(zl >> 32, output, 8); - MBEDTLS_PUT_UINT32_BE(zl, output, 12); + return; } int mbedtls_gcm_starts(mbedtls_gcm_context *ctx, diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh index f18bfadec..c25f04440 100755 --- a/tests/scripts/all.sh +++ b/tests/scripts/all.sh @@ -5036,6 +5036,19 @@ component_test_aes_only_128_bit_keys_have_builtins () { programs/test/selftest } +component_test_gcm_largetable () { + msg "build: default config + GCM_LARGE_TABLE - AESNI_C - AESCE_C" + scripts/config.py set MBEDTLS_GCM_LARGE_TABLE + scripts/config.py unset MBEDTLS_PADLOCK_C + scripts/config.py unset MBEDTLS_AESNI_C + scripts/config.py unset MBEDTLS_AESCE_C + + make CFLAGS='-O2 -Werror -Wall -Wextra' + + msg "test: default config - GCM_LARGE_TABLE - AESNI_C - AESCE_C" + make test +} + component_test_aes_fewer_tables () { msg "build: default config with AES_FEWER_TABLES enabled" scripts/config.py set MBEDTLS_AES_FEWER_TABLES