From 69b5a860644510e0315b9ec65991d5e111e81f15 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:02:08 +0000 Subject: [PATCH 01/12] Improve mbedtls_xor for IAR Signed-off-by: Dave Rodgman --- library/common.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/library/common.h b/library/common.h index e532777e7..5c73e8a66 100644 --- a/library/common.h +++ b/library/common.h @@ -191,21 +191,30 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } + // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + // where n is a constant multiple of 16. + // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + // constant, and very little difference if n is not a compile-time constant. + if (n % 16 != 0) #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ for (; (i + 8) <= n; i += 8) { uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } + if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } + if (n % 4 != 0) #endif #endif - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; + { + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } } } @@ -236,15 +245,23 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } + // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + // where n is a constant multiple of 8. + // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + // constant, and very little difference if n is not a compile-time constant. + if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } + if (n % 4 != 0) #endif #endif - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; + { + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } } } From 7d8c99abb08a9e0716cd9bb9747cffce1a7d235a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:02:58 +0000 Subject: [PATCH 02/12] Move MBEDTLS_COMPILER_IS_GCC defn into alignment.h Signed-off-by: Dave Rodgman --- library/alignment.h | 8 ++++++++ library/common.h | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index 9e1e044ec..219f4f0af 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,6 +15,14 @@ #include #include +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#define MBEDTLS_GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory * accesses are known to be efficient. 
diff --git a/library/common.h b/library/common.h index 5c73e8a66..faefd64ea 100644 --- a/library/common.h +++ b/library/common.h @@ -27,15 +27,6 @@ #define MBEDTLS_HAVE_NEON_INTRINSICS #endif - -#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) -/* Defined if the compiler really is gcc and not clang, etc */ -#define MBEDTLS_COMPILER_IS_GCC -#define MBEDTLS_GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif - /** Helper to define a function as static except when building invasive tests. * * If a function is only used inside its own source file and should be From c581264977e2b0309697fddc4a345ba1c4d02544 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:04:28 +0000 Subject: [PATCH 03/12] Fix unaligned access on old compilers Add an alternative implementation of unaligned access that is efficient for IAR and old versions of gcc. Signed-off-by: Dave Rodgman --- library/alignment.h | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/library/alignment.h b/library/alignment.h index 219f4f0af..e7318c2ac 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -45,6 +45,46 @@ #define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS #endif +#if defined(__IAR_SYSTEMS_ICC__) && \ + (defined(MBEDTLS_ARCH_IS_ARM64) || defined(MBEDTLS_ARCH_IS_ARM32) \ + || defined(__ICCRX__) || defined(__ICCRL78__) || defined(__ICCRISCV__)) +#pragma language=save +#pragma language=extended +#define MBEDTLS_POP_IAR_LANGUAGE_PRAGMA +/* IAR recommend this technique for accessing unaligned data in + * https://www.iar.com/knowledge/support/technical-notes/compiler/accessing-unaligned-data + * This results in a single load / store instruction (if unaligned access is supported). + * According to that document, this is only supported on certain architectures. + */ + #define UINT_UNALIGNED +typedef uint16_t __packed mbedtls_uint16_unaligned_t; +typedef uint32_t __packed mbedtls_uint32_unaligned_t; +typedef uint64_t __packed mbedtls_uint64_unaligned_t; +#elif defined(MBEDTLS_COMPILER_IS_GCC) && (MBEDTLS_GCC_VERSION >= 40504) && \ + ((MBEDTLS_GCC_VERSION < 90300) || (!defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS))) +/* + * Old versions of gcc, depending on how the target is specified, may generate a branch to memcpy + * for calls like `memcpy(dest, src, 4)` rather than generating some LDR or LDRB instructions + * (similar for stores). + * Recent versions where unaligned access is not enabled also do this. + * + * For performance (and code size, in some cases), we want to avoid the branch and just generate + * some inline load/store instructions since the access is small and constant-size. + * + * The manual states: + * "The aligned attribute specifies a minimum alignment for the variable or structure field, + * measured in bytes." + * https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html + * + * Tested with several versions of GCC from 4.5.0 up to 9.3.0 + * We don't enable for older than 4.5.0 as this has not been tested. + */ + #define UINT_UNALIGNED +typedef uint16_t __attribute__((__aligned__(1))) mbedtls_uint16_unaligned_t; +typedef uint32_t __attribute__((__aligned__(1))) mbedtls_uint32_unaligned_t; +typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; + #endif + /** * Read the unsigned 16 bits integer from the given address, which need not * be aligned. 
@@ -55,7 +95,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) { uint16_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; + r = *p16; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -68,7 +113,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) */ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; + *p16 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } /** @@ -81,7 +131,12 @@ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) inline uint32_t mbedtls_get_unaligned_uint32(const void *p) { uint32_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; + r = *p32; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -94,7 +149,12 @@ inline uint32_t mbedtls_get_unaligned_uint32(const void *p) */ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; + *p32 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } /** @@ -107,7 +167,12 @@ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) inline uint64_t mbedtls_get_unaligned_uint64(const void *p) { uint64_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; + r = *p64; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -120,9 +185,18 @@ inline uint64_t mbedtls_get_unaligned_uint64(const void *p) */ inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; + *p64 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } +#if defined(MBEDTLS_POP_IAR_LANGUAGE_PRAGMA) +#pragma language=restore +#endif + /** Byte Reading Macros * * Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th From 55b5dd2cfc3c751368ddf262d7fb1b8ba7540bdc Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:06:52 +0000 Subject: [PATCH 04/12] Make unaligned accessors always inline Signed-off-by: Dave Rodgman --- library/alignment.h | 48 +++++++++++++++++++++++++++++++++++------ library/platform_util.c | 12 ----------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index e7318c2ac..b61301922 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -85,6 +85,12 @@ typedef uint32_t __attribute__((__aligned__(1))) mbedtls_uint32_unaligned_t; typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; #endif +/* + * We try to force mbedtls_(get|put)_unaligned_uintXX to be always inline, because this results + * in code that is both smaller and faster. IAR and gcc both benefit from this when optimising + * for size. + */ + /** * Read the unsigned 16 bits integer from the given address, which need not * be aligned. 
@@ -92,7 +98,12 @@ typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; * \param p pointer to 2 bytes of data * \return Data at the given address */ -inline uint16_t mbedtls_get_unaligned_uint16(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint16_t mbedtls_get_unaligned_uint16(const void *p) { uint16_t r; #if defined(UINT_UNALIGNED) @@ -111,7 +122,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) * \param p pointer to 2 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; @@ -128,7 +144,12 @@ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) * \param p pointer to 4 bytes of data * \return Data at the given address */ -inline uint32_t mbedtls_get_unaligned_uint32(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint32_t mbedtls_get_unaligned_uint32(const void *p) { uint32_t r; #if defined(UINT_UNALIGNED) @@ -147,7 +168,12 @@ inline uint32_t mbedtls_get_unaligned_uint32(const void *p) * \param p pointer to 4 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; @@ -164,7 +190,12 @@ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) * \param p pointer to 8 bytes of data * \return Data at the given address */ -inline uint64_t mbedtls_get_unaligned_uint64(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint64_t mbedtls_get_unaligned_uint64(const void *p) { uint64_t r; #if defined(UINT_UNALIGNED) @@ -183,7 +214,12 @@ inline uint64_t mbedtls_get_unaligned_uint64(const void *p) * \param p pointer to 8 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; diff --git a/library/platform_util.c b/library/platform_util.c index 63643d26f..e79fc5c8e 100644 --- a/library/platform_util.c +++ b/library/platform_util.c @@ -226,18 +226,6 @@ extern inline void mbedtls_xor(unsigned char *r, const unsigned char *b, size_t n); -extern inline uint16_t mbedtls_get_unaligned_uint16(const void *p); - -extern inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x); - -extern inline uint32_t mbedtls_get_unaligned_uint32(const void *p); - -extern inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x); - -extern inline uint64_t mbedtls_get_unaligned_uint64(const void *p); - -extern inline void 
mbedtls_put_unaligned_uint64(void *p, uint64_t x); - #if defined(MBEDTLS_HAVE_TIME) && !defined(MBEDTLS_PLATFORM_MS_TIME_ALT) #include From 18d90d75195fb56834360c42a4f06318afa3cccc Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:08:04 +0000 Subject: [PATCH 05/12] Make mbedtls_xor always inline Signed-off-by: Dave Rodgman --- library/common.h | 18 +++++++++++++++++- library/platform_util.c | 9 --------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/library/common.h b/library/common.h index faefd64ea..760dff49e 100644 --- a/library/common.h +++ b/library/common.h @@ -158,6 +158,12 @@ static inline const unsigned char *mbedtls_buffer_offset_const( return p == NULL ? NULL : p + n; } +/* Always inline mbedtls_xor for similar reasons as mbedtls_xor_no_simd. */ +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif /** * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n @@ -169,7 +175,10 @@ static inline const unsigned char *mbedtls_buffer_offset_const( * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. */ -inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +static inline void mbedtls_xor(unsigned char *r, + const unsigned char *a, + const unsigned char *b, + size_t n) { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) @@ -209,6 +218,13 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned } } +/* Always inline mbedtls_xor_no_simd as we see significant perf regressions when it does not get + * inlined (e.g., observed about 3x perf difference in gcm_mult_largetable with gcc 7 - 12) */ +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif /** * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n diff --git a/library/platform_util.c b/library/platform_util.c index e79fc5c8e..9f5dcb874 100644 --- a/library/platform_util.c +++ b/library/platform_util.c @@ -217,15 +217,6 @@ struct tm *mbedtls_platform_gmtime_r(const mbedtls_time_t *tt, void (*mbedtls_test_hook_test_fail)(const char *, int, const char *); #endif /* MBEDTLS_TEST_HOOKS */ -/* - * Provide external definitions of some inline functions so that the compiler - * has the option to not inline them - */ -extern inline void mbedtls_xor(unsigned char *r, - const unsigned char *a, - const unsigned char *b, - size_t n); - #if defined(MBEDTLS_HAVE_TIME) && !defined(MBEDTLS_PLATFORM_MS_TIME_ALT) #include From 2143a4ad1fc79e3e601e078f86aafcb6fbabcc71 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:08:17 +0000 Subject: [PATCH 06/12] Improve mbedtls_xor docs Signed-off-by: Dave Rodgman --- library/common.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/library/common.h b/library/common.h index 760dff49e..3b1c7e1e7 100644 --- a/library/common.h +++ b/library/common.h @@ -174,6 +174,14 @@ __attribute__((always_inline)) * \param a Pointer to input (buffer of at least \p n bytes) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. + * + * \note Depending on the situation, it may be faster to use either mbedtls_xor or + * mbedtls_xor_no_simd (these are functionally equivalent). 
+ * If the result is used immediately after the xor operation in non-SIMD code (e.g, in + * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar + * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor(unsigned char *r, const unsigned char *a, @@ -238,6 +246,14 @@ __attribute__((always_inline)) * \param a Pointer to input (buffer of at least \p n bytes) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. + * + * \note Depending on the situation, it may be faster to use either mbedtls_xor or + * mbedtls_xor_no_simd (these are functionally equivalent). + * If the result is used immediately after the xor operation in non-SIMD code (e.g, in + * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar + * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, From 7470557855e5eecb74064a7e1773995e03bd622a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:29:32 +0000 Subject: [PATCH 07/12] Add changelog entry Signed-off-by: Dave Rodgman --- ChangeLog.d/iar-gcc-perf.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 ChangeLog.d/iar-gcc-perf.txt diff --git a/ChangeLog.d/iar-gcc-perf.txt b/ChangeLog.d/iar-gcc-perf.txt new file mode 100644 index 000000000..fb0fbb10d --- /dev/null +++ b/ChangeLog.d/iar-gcc-perf.txt @@ -0,0 +1,2 @@ +Features + * Improve performance for gcc (versions older than 9.3.0) and IAR. From 00b4eeb0b3b8569ee371dd91a0a6fac6ebc0ee34 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:06:41 +0000 Subject: [PATCH 08/12] Improve comments Signed-off-by: Dave Rodgman --- library/common.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/library/common.h b/library/common.h index 3b1c7e1e7..2eb917037 100644 --- a/library/common.h +++ b/library/common.h @@ -158,7 +158,7 @@ static inline const unsigned char *mbedtls_buffer_offset_const( return p == NULL ? NULL : p + n; } -/* Always inline mbedtls_xor for similar reasons as mbedtls_xor_no_simd. */ +/* Always inline mbedtls_xor() for similar reasons as mbedtls_xor_no_simd(). */ #if defined(__IAR_SYSTEMS_ICC__) #pragma inline = forced #elif defined(__GNUC__) @@ -175,12 +175,12 @@ __attribute__((always_inline)) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. * - * \note Depending on the situation, it may be faster to use either mbedtls_xor or - * mbedtls_xor_no_simd (these are functionally equivalent). + * \note Depending on the situation, it may be faster to use either mbedtls_xor() or + * mbedtls_xor_no_simd() (these are functionally equivalent). * If the result is used immediately after the xor operation in non-SIMD code (e.g, in * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar - * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where - * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. 
+ * registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster. * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor(unsigned char *r, @@ -199,10 +199,10 @@ static inline void mbedtls_xor(unsigned char *r, uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } - // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - // where n is a constant multiple of 16. - // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - // constant, and very little difference if n is not a compile-time constant. + /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + * where n is a constant multiple of 16. + * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + * constant, and very little difference if n is not a compile-time constant. */ if (n % 16 != 0) #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ @@ -226,7 +226,7 @@ static inline void mbedtls_xor(unsigned char *r, } } -/* Always inline mbedtls_xor_no_simd as we see significant perf regressions when it does not get +/* Always inline mbedtls_xor_no_simd() as we see significant perf regressions when it does not get * inlined (e.g., observed about 3x perf difference in gcm_mult_largetable with gcc 7 - 12) */ #if defined(__IAR_SYSTEMS_ICC__) #pragma inline = forced @@ -237,7 +237,7 @@ __attribute__((always_inline)) * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n * - * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5% + * In some situations, this can perform better than mbedtls_xor() (e.g., it's about 5% * better in AES-CBC). * * \param r Pointer to result (buffer of at least \p n bytes). \p r @@ -247,12 +247,12 @@ __attribute__((always_inline)) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. * - * \note Depending on the situation, it may be faster to use either mbedtls_xor or - * mbedtls_xor_no_simd (these are functionally equivalent). + * \note Depending on the situation, it may be faster to use either mbedtls_xor() or + * mbedtls_xor_no_simd() (these are functionally equivalent). * If the result is used immediately after the xor operation in non-SIMD code (e.g, in * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar - * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where - * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster. * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor_no_simd(unsigned char *r, @@ -268,10 +268,10 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } - // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - // where n is a constant multiple of 8. - // It makes no difference for others (e.g. 
recent gcc and clang) if n is a compile-time - // constant, and very little difference if n is not a compile-time constant. + /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + * where n is a constant multiple of 16. + * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + * constant, and very little difference if n is not a compile-time constant. */ if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { From 336efeec50cc237a7c1e03a0744d3188a7f805fd Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:38:53 +0000 Subject: [PATCH 09/12] Move MBEDTLS_COMPILER_IS_GCC & MBEDTLS_GCC_VERSION into build_info Signed-off-by: Dave Rodgman --- include/mbedtls/build_info.h | 8 ++++++++ library/alignment.h | 8 +------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/mbedtls/build_info.h b/include/mbedtls/build_info.h index 7a70e2543..c0b724c83 100644 --- a/include/mbedtls/build_info.h +++ b/include/mbedtls/build_info.h @@ -83,6 +83,14 @@ #endif #endif +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#define MBEDTLS_GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + #if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE) #define _CRT_SECURE_NO_DEPRECATE 1 #endif diff --git a/library/alignment.h b/library/alignment.h index b61301922..26f15261c 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,13 +15,7 @@ #include #include -#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) -/* Defined if the compiler really is gcc and not clang, etc */ -#define MBEDTLS_COMPILER_IS_GCC -#define MBEDTLS_GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif +#include "mbedtls/build_info.h" /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory From 075f8797ac35925089116959c98366ccd2cb00e6 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:48:42 +0000 Subject: [PATCH 10/12] Remove include of build_info.h Signed-off-by: Dave Rodgman --- library/alignment.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index 26f15261c..248f29bc7 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,8 +15,6 @@ #include #include -#include "mbedtls/build_info.h" - /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory * accesses are known to be efficient. From 00b530e3957061a06663e1785dc923ee0b7e7c95 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 Jan 2024 09:36:34 +0000 Subject: [PATCH 11/12] Limit compiler hint to compilers that are known to benefit from it Signed-off-by: Dave Rodgman --- library/common.h | 50 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/library/common.h b/library/common.h index 2eb917037..937c80284 100644 --- a/library/common.h +++ b/library/common.h @@ -199,30 +199,40 @@ static inline void mbedtls_xor(unsigned char *r, uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } +#if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case * where n is a constant multiple of 16. 
- * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - * constant, and very little difference if n is not a compile-time constant. */ - if (n % 16 != 0) + * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time + * constant, and is a very small perf regression if n is not a compile-time constant. */ + if (n % 16 == 0) { + return; + } +#endif #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ for (; (i + 8) <= n; i += 8) { uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } - if (n % 8 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 8 == 0) { + return; + } +#endif #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } - if (n % 4 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 4 == 0) { + return; + } #endif #endif - { - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; - } +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; } } @@ -268,23 +278,29 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } +#if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case * where n is a constant multiple of 16. - * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - * constant, and very little difference if n is not a compile-time constant. */ - if (n % 8 != 0) + * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time + * constant, and is a very small perf regression if n is not a compile-time constant. */ + if (n % 8 == 0) { + return; + } +#endif #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } - if (n % 4 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 4 == 0) { + return; + } #endif #endif - { - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; - } +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; } } From c64280a2d71f6e88835787b2121857f063af7029 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 Jan 2024 10:03:22 +0000 Subject: [PATCH 12/12] Fix comment typo Signed-off-by: Dave Rodgman --- library/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/common.h b/library/common.h index 937c80284..3936ffdfe 100644 --- a/library/common.h +++ b/library/common.h @@ -280,7 +280,7 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, } #if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - * where n is a constant multiple of 16. + * where n is a constant multiple of 8. * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time * constant, and is a very small perf regression if n is not a compile-time constant. */ if (n % 8 == 0) {
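The series above combines two ideas: an under-aligned typedef so that small constant-size unaligned accesses compile down to plain load/store instructions instead of a call to memcpy, and an early return after the word-sized loop so that the compiler can delete the byte-by-byte tail when the length is a compile-time constant multiple of the word size. The standalone sketch below illustrates both ideas in isolation. It is a simplified illustration, not the mbedtls code (which also has the IAR __packed path, NEON and 64-bit variants, and the always-inline annotations), and the names demo_u32_unaligned, demo_get_u32, demo_put_u32 and demo_xor are invented for the example. The under-aligned-typedef path is only enabled for real GCC here, mirroring the guard used in the patches; everything else falls back to memcpy.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Under-aligned typedef, mirroring the patch's mbedtls_uint32_unaligned_t.
 * Only enabled for real GCC; other compilers use the memcpy fallback. */
#if defined(__GNUC__) && !defined(__clang__)
typedef uint32_t __attribute__((__aligned__(1))) demo_u32_unaligned;
#define DEMO_UINT_UNALIGNED
#endif

static inline uint32_t demo_get_u32(const void *p)
{
#if defined(DEMO_UINT_UNALIGNED)
    return *(const demo_u32_unaligned *) p;   /* single (unaligned) load */
#else
    uint32_t r;
    memcpy(&r, p, sizeof(r));                 /* portable fallback */
    return r;
#endif
}

static inline void demo_put_u32(void *p, uint32_t x)
{
#if defined(DEMO_UINT_UNALIGNED)
    *(demo_u32_unaligned *) p = x;            /* single (unaligned) store */
#else
    memcpy(p, &x, sizeof(x));
#endif
}

/* Simplified XOR with the tail-elimination hint: when n is a compile-time
 * constant multiple of 4, the early return lets the compiler discard the
 * byte-by-byte tail loop entirely. */
static inline void demo_xor(unsigned char *r, const unsigned char *a,
                            const unsigned char *b, size_t n)
{
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        demo_put_u32(r + i, demo_get_u32(a + i) ^ demo_get_u32(b + i));
    }
    if (n % 4 == 0) {
        return;
    }
    for (; i < n; i++) {
        r[i] = a[i] ^ b[i];
    }
}

int main(void)
{
    unsigned char a[13], b[13], r[13];
    for (size_t i = 0; i < sizeof(a); i++) {
        a[i] = (unsigned char) i;
        b[i] = (unsigned char) (0xA5 ^ i);
    }
    demo_xor(r, a, b, sizeof(r));
    for (size_t i = 0; i < sizeof(r); i++) {
        printf("%02x", r[i]);      /* every byte should print as a5 */
    }
    printf("\n");
    return 0;
}

Building this sketch at -Os with a fixed, constant n that is a multiple of 4 should show the byte loop disappearing and, on targets that support unaligned access, each 4-byte access becoming a single load/store, which is the effect the patches aim for; exact code generation will of course vary by compiler version and target.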