From 69b5a860644510e0315b9ec65991d5e111e81f15 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:02:08 +0000 Subject: [PATCH 01/12] Improve mbedtls_xor for IAR Signed-off-by: Dave Rodgman --- library/common.h | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/library/common.h b/library/common.h index e532777e7..5c73e8a66 100644 --- a/library/common.h +++ b/library/common.h @@ -191,21 +191,30 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } + // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + // where n is a constant multiple of 16. + // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + // constant, and very little difference if n is not a compile-time constant. + if (n % 16 != 0) #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ for (; (i + 8) <= n; i += 8) { uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } + if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } + if (n % 4 != 0) #endif #endif - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; + { + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } } } @@ -236,15 +245,23 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } + // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + // where n is a constant multiple of 8. + // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + // constant, and very little difference if n is not a compile-time constant. + if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } + if (n % 4 != 0) #endif #endif - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; + { + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; + } } } From 7d8c99abb08a9e0716cd9bb9747cffce1a7d235a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:02:58 +0000 Subject: [PATCH 02/12] Move MBEDTLS_COMPILER_IS_GCC defn into alignment.h Signed-off-by: Dave Rodgman --- library/alignment.h | 8 ++++++++ library/common.h | 9 --------- 2 files changed, 8 insertions(+), 9 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index 9e1e044ec..219f4f0af 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,6 +15,14 @@ #include #include +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#define MBEDTLS_GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory * accesses are known to be efficient. 
diff --git a/library/common.h b/library/common.h index 5c73e8a66..faefd64ea 100644 --- a/library/common.h +++ b/library/common.h @@ -27,15 +27,6 @@ #define MBEDTLS_HAVE_NEON_INTRINSICS #endif - -#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) -/* Defined if the compiler really is gcc and not clang, etc */ -#define MBEDTLS_COMPILER_IS_GCC -#define MBEDTLS_GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif - /** Helper to define a function as static except when building invasive tests. * * If a function is only used inside its own source file and should be From c581264977e2b0309697fddc4a345ba1c4d02544 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:04:28 +0000 Subject: [PATCH 03/12] Fix unaligned access on old compilers Add an alternative implementation of unaligned access that is efficient for IAR and old versions of gcc. Signed-off-by: Dave Rodgman --- library/alignment.h | 74 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/library/alignment.h b/library/alignment.h index 219f4f0af..e7318c2ac 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -45,6 +45,46 @@ #define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS #endif +#if defined(__IAR_SYSTEMS_ICC__) && \ + (defined(MBEDTLS_ARCH_IS_ARM64) || defined(MBEDTLS_ARCH_IS_ARM32) \ + || defined(__ICCRX__) || defined(__ICCRL78__) || defined(__ICCRISCV__)) +#pragma language=save +#pragma language=extended +#define MBEDTLS_POP_IAR_LANGUAGE_PRAGMA +/* IAR recommend this technique for accessing unaligned data in + * https://www.iar.com/knowledge/support/technical-notes/compiler/accessing-unaligned-data + * This results in a single load / store instruction (if unaligned access is supported). + * According to that document, this is only supported on certain architectures. + */ + #define UINT_UNALIGNED +typedef uint16_t __packed mbedtls_uint16_unaligned_t; +typedef uint32_t __packed mbedtls_uint32_unaligned_t; +typedef uint64_t __packed mbedtls_uint64_unaligned_t; +#elif defined(MBEDTLS_COMPILER_IS_GCC) && (MBEDTLS_GCC_VERSION >= 40504) && \ + ((MBEDTLS_GCC_VERSION < 90300) || (!defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS))) +/* + * Old versions of gcc, depending on how the target is specified, may generate a branch to memcpy + * for calls like `memcpy(dest, src, 4)` rather than generating some LDR or LDRB instructions + * (similar for stores). + * Recent versions where unaligned access is not enabled also do this. + * + * For performance (and code size, in some cases), we want to avoid the branch and just generate + * some inline load/store instructions since the access is small and constant-size. + * + * The manual states: + * "The aligned attribute specifies a minimum alignment for the variable or structure field, + * measured in bytes." + * https://gcc.gnu.org/onlinedocs/gcc/Common-Variable-Attributes.html + * + * Tested with several versions of GCC from 4.5.0 up to 9.3.0 + * We don't enable for older than 4.5.0 as this has not been tested. + */ + #define UINT_UNALIGNED +typedef uint16_t __attribute__((__aligned__(1))) mbedtls_uint16_unaligned_t; +typedef uint32_t __attribute__((__aligned__(1))) mbedtls_uint32_unaligned_t; +typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; + #endif + /** * Read the unsigned 16 bits integer from the given address, which need not * be aligned. 
@@ -55,7 +95,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) { uint16_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; + r = *p16; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -68,7 +113,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) */ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; + *p16 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } /** @@ -81,7 +131,12 @@ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) inline uint32_t mbedtls_get_unaligned_uint32(const void *p) { uint32_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; + r = *p32; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -94,7 +149,12 @@ inline uint32_t mbedtls_get_unaligned_uint32(const void *p) */ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; + *p32 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } /** @@ -107,7 +167,12 @@ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) inline uint64_t mbedtls_get_unaligned_uint64(const void *p) { uint64_t r; +#if defined(UINT_UNALIGNED) + mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; + r = *p64; +#else memcpy(&r, p, sizeof(r)); +#endif return r; } @@ -120,9 +185,18 @@ inline uint64_t mbedtls_get_unaligned_uint64(const void *p) */ inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) { +#if defined(UINT_UNALIGNED) + mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; + *p64 = x; +#else memcpy(p, &x, sizeof(x)); +#endif } +#if defined(MBEDTLS_POP_IAR_LANGUAGE_PRAGMA) +#pragma language=restore +#endif + /** Byte Reading Macros * * Given a multi-byte integer \p x, MBEDTLS_BYTE_n retrieves the n-th From 55b5dd2cfc3c751368ddf262d7fb1b8ba7540bdc Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:06:52 +0000 Subject: [PATCH 04/12] Make unaligned accessors always inline Signed-off-by: Dave Rodgman --- library/alignment.h | 48 +++++++++++++++++++++++++++++++++++------ library/platform_util.c | 12 ----------- 2 files changed, 42 insertions(+), 18 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index e7318c2ac..b61301922 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -85,6 +85,12 @@ typedef uint32_t __attribute__((__aligned__(1))) mbedtls_uint32_unaligned_t; typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; #endif +/* + * We try to force mbedtls_(get|put)_unaligned_uintXX to be always inline, because this results + * in code that is both smaller and faster. IAR and gcc both benefit from this when optimising + * for size. + */ + /** * Read the unsigned 16 bits integer from the given address, which need not * be aligned. 
@@ -92,7 +98,12 @@ typedef uint64_t __attribute__((__aligned__(1))) mbedtls_uint64_unaligned_t; * \param p pointer to 2 bytes of data * \return Data at the given address */ -inline uint16_t mbedtls_get_unaligned_uint16(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint16_t mbedtls_get_unaligned_uint16(const void *p) { uint16_t r; #if defined(UINT_UNALIGNED) @@ -111,7 +122,12 @@ inline uint16_t mbedtls_get_unaligned_uint16(const void *p) * \param p pointer to 2 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint16_unaligned_t *p16 = (mbedtls_uint16_unaligned_t *) p; @@ -128,7 +144,12 @@ inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x) * \param p pointer to 4 bytes of data * \return Data at the given address */ -inline uint32_t mbedtls_get_unaligned_uint32(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint32_t mbedtls_get_unaligned_uint32(const void *p) { uint32_t r; #if defined(UINT_UNALIGNED) @@ -147,7 +168,12 @@ inline uint32_t mbedtls_get_unaligned_uint32(const void *p) * \param p pointer to 4 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint32_unaligned_t *p32 = (mbedtls_uint32_unaligned_t *) p; @@ -164,7 +190,12 @@ inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x) * \param p pointer to 8 bytes of data * \return Data at the given address */ -inline uint64_t mbedtls_get_unaligned_uint64(const void *p) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline uint64_t mbedtls_get_unaligned_uint64(const void *p) { uint64_t r; #if defined(UINT_UNALIGNED) @@ -183,7 +214,12 @@ inline uint64_t mbedtls_get_unaligned_uint64(const void *p) * \param p pointer to 8 bytes of data * \param x data to write */ -inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif +static inline void mbedtls_put_unaligned_uint64(void *p, uint64_t x) { #if defined(UINT_UNALIGNED) mbedtls_uint64_unaligned_t *p64 = (mbedtls_uint64_unaligned_t *) p; diff --git a/library/platform_util.c b/library/platform_util.c index 63643d26f..e79fc5c8e 100644 --- a/library/platform_util.c +++ b/library/platform_util.c @@ -226,18 +226,6 @@ extern inline void mbedtls_xor(unsigned char *r, const unsigned char *b, size_t n); -extern inline uint16_t mbedtls_get_unaligned_uint16(const void *p); - -extern inline void mbedtls_put_unaligned_uint16(void *p, uint16_t x); - -extern inline uint32_t mbedtls_get_unaligned_uint32(const void *p); - -extern inline void mbedtls_put_unaligned_uint32(void *p, uint32_t x); - -extern inline uint64_t mbedtls_get_unaligned_uint64(const void *p); - -extern inline void 
mbedtls_put_unaligned_uint64(void *p, uint64_t x); - #if defined(MBEDTLS_HAVE_TIME) && !defined(MBEDTLS_PLATFORM_MS_TIME_ALT) #include From 18d90d75195fb56834360c42a4f06318afa3cccc Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:08:04 +0000 Subject: [PATCH 05/12] Make mbedtls_xor always inline Signed-off-by: Dave Rodgman --- library/common.h | 18 +++++++++++++++++- library/platform_util.c | 9 --------- 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/library/common.h b/library/common.h index faefd64ea..760dff49e 100644 --- a/library/common.h +++ b/library/common.h @@ -158,6 +158,12 @@ static inline const unsigned char *mbedtls_buffer_offset_const( return p == NULL ? NULL : p + n; } +/* Always inline mbedtls_xor for similar reasons as mbedtls_xor_no_simd. */ +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif /** * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n @@ -169,7 +175,10 @@ static inline const unsigned char *mbedtls_buffer_offset_const( * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. */ -inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned char *b, size_t n) +static inline void mbedtls_xor(unsigned char *r, + const unsigned char *a, + const unsigned char *b, + size_t n) { size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) @@ -209,6 +218,13 @@ inline void mbedtls_xor(unsigned char *r, const unsigned char *a, const unsigned } } +/* Always inline mbedtls_xor_no_simd as we see significant perf regressions when it does not get + * inlined (e.g., observed about 3x perf difference in gcm_mult_largetable with gcc 7 - 12) */ +#if defined(__IAR_SYSTEMS_ICC__) +#pragma inline = forced +#elif defined(__GNUC__) +__attribute__((always_inline)) +#endif /** * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n diff --git a/library/platform_util.c b/library/platform_util.c index e79fc5c8e..9f5dcb874 100644 --- a/library/platform_util.c +++ b/library/platform_util.c @@ -217,15 +217,6 @@ struct tm *mbedtls_platform_gmtime_r(const mbedtls_time_t *tt, void (*mbedtls_test_hook_test_fail)(const char *, int, const char *); #endif /* MBEDTLS_TEST_HOOKS */ -/* - * Provide external definitions of some inline functions so that the compiler - * has the option to not inline them - */ -extern inline void mbedtls_xor(unsigned char *r, - const unsigned char *a, - const unsigned char *b, - size_t n); - #if defined(MBEDTLS_HAVE_TIME) && !defined(MBEDTLS_PLATFORM_MS_TIME_ALT) #include From 2143a4ad1fc79e3e601e078f86aafcb6fbabcc71 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:08:17 +0000 Subject: [PATCH 06/12] Improve mbedtls_xor docs Signed-off-by: Dave Rodgman --- library/common.h | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/library/common.h b/library/common.h index 760dff49e..3b1c7e1e7 100644 --- a/library/common.h +++ b/library/common.h @@ -174,6 +174,14 @@ __attribute__((always_inline)) * \param a Pointer to input (buffer of at least \p n bytes) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. + * + * \note Depending on the situation, it may be faster to use either mbedtls_xor or + * mbedtls_xor_no_simd (these are functionally equivalent). 
+ * If the result is used immediately after the xor operation in non-SIMD code (e.g, in + * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar + * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor(unsigned char *r, const unsigned char *a, @@ -238,6 +246,14 @@ __attribute__((always_inline)) * \param a Pointer to input (buffer of at least \p n bytes) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. + * + * \note Depending on the situation, it may be faster to use either mbedtls_xor or + * mbedtls_xor_no_simd (these are functionally equivalent). + * If the result is used immediately after the xor operation in non-SIMD code (e.g, in + * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar + * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor_no_simd(unsigned char *r, const unsigned char *a, From 7470557855e5eecb74064a7e1773995e03bd622a Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 14:29:32 +0000 Subject: [PATCH 07/12] Add changelog entry Signed-off-by: Dave Rodgman --- ChangeLog.d/iar-gcc-perf.txt | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 ChangeLog.d/iar-gcc-perf.txt diff --git a/ChangeLog.d/iar-gcc-perf.txt b/ChangeLog.d/iar-gcc-perf.txt new file mode 100644 index 000000000..fb0fbb10d --- /dev/null +++ b/ChangeLog.d/iar-gcc-perf.txt @@ -0,0 +1,2 @@ +Features + * Improve performance for gcc (versions older than 9.3.0) and IAR. From 00b4eeb0b3b8569ee371dd91a0a6fac6ebc0ee34 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:06:41 +0000 Subject: [PATCH 08/12] Improve comments Signed-off-by: Dave Rodgman --- library/common.h | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/library/common.h b/library/common.h index 3b1c7e1e7..2eb917037 100644 --- a/library/common.h +++ b/library/common.h @@ -158,7 +158,7 @@ static inline const unsigned char *mbedtls_buffer_offset_const( return p == NULL ? NULL : p + n; } -/* Always inline mbedtls_xor for similar reasons as mbedtls_xor_no_simd. */ +/* Always inline mbedtls_xor() for similar reasons as mbedtls_xor_no_simd(). */ #if defined(__IAR_SYSTEMS_ICC__) #pragma inline = forced #elif defined(__GNUC__) @@ -175,12 +175,12 @@ __attribute__((always_inline)) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. * - * \note Depending on the situation, it may be faster to use either mbedtls_xor or - * mbedtls_xor_no_simd (these are functionally equivalent). + * \note Depending on the situation, it may be faster to use either mbedtls_xor() or + * mbedtls_xor_no_simd() (these are functionally equivalent). * If the result is used immediately after the xor operation in non-SIMD code (e.g, in * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar - * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where - * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. 
+ * registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster. * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor(unsigned char *r, @@ -199,10 +199,10 @@ static inline void mbedtls_xor(unsigned char *r, uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } - // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - // where n is a constant multiple of 16. - // It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - // constant, and very little difference if n is not a compile-time constant. + /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + * where n is a constant multiple of 16. + * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + * constant, and very little difference if n is not a compile-time constant. */ if (n % 16 != 0) #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ @@ -226,7 +226,7 @@ static inline void mbedtls_xor(unsigned char *r, } } -/* Always inline mbedtls_xor_no_simd as we see significant perf regressions when it does not get +/* Always inline mbedtls_xor_no_simd() as we see significant perf regressions when it does not get * inlined (e.g., observed about 3x perf difference in gcm_mult_largetable with gcc 7 - 12) */ #if defined(__IAR_SYSTEMS_ICC__) #pragma inline = forced @@ -237,7 +237,7 @@ __attribute__((always_inline)) * Perform a fast block XOR operation, such that * r[i] = a[i] ^ b[i] where 0 <= i < n * - * In some situations, this can perform better than mbedtls_xor (e.g., it's about 5% + * In some situations, this can perform better than mbedtls_xor() (e.g., it's about 5% * better in AES-CBC). * * \param r Pointer to result (buffer of at least \p n bytes). \p r @@ -247,12 +247,12 @@ __attribute__((always_inline)) * \param b Pointer to input (buffer of at least \p n bytes) * \param n Number of bytes to process. * - * \note Depending on the situation, it may be faster to use either mbedtls_xor or - * mbedtls_xor_no_simd (these are functionally equivalent). + * \note Depending on the situation, it may be faster to use either mbedtls_xor() or + * mbedtls_xor_no_simd() (these are functionally equivalent). * If the result is used immediately after the xor operation in non-SIMD code (e.g, in * AES-CBC), there may be additional latency to transfer the data from SIMD to scalar - * registers, and in this case, mbedtls_xor_no_simd may be faster. In other cases where - * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor may be faster. + * registers, and in this case, mbedtls_xor_no_simd() may be faster. In other cases where + * the result is not used immediately (e.g., in AES-CTR), mbedtls_xor() may be faster. * For targets without SIMD support, they will behave the same. */ static inline void mbedtls_xor_no_simd(unsigned char *r, @@ -268,10 +268,10 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } - // This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - // where n is a constant multiple of 8. - // It makes no difference for others (e.g. 
recent gcc and clang) if n is a compile-time - // constant, and very little difference if n is not a compile-time constant. + /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case + * where n is a constant multiple of 16. + * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time + * constant, and very little difference if n is not a compile-time constant. */ if (n % 8 != 0) #else for (; (i + 4) <= n; i += 4) { From 336efeec50cc237a7c1e03a0744d3188a7f805fd Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:38:53 +0000 Subject: [PATCH 09/12] Move MBEDTLS_COMPILER_IS_GCC & MBEDTLS_GCC_VERSION into build_info Signed-off-by: Dave Rodgman --- include/mbedtls/build_info.h | 8 ++++++++ library/alignment.h | 8 +------- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/include/mbedtls/build_info.h b/include/mbedtls/build_info.h index 7a70e2543..c0b724c83 100644 --- a/include/mbedtls/build_info.h +++ b/include/mbedtls/build_info.h @@ -83,6 +83,14 @@ #endif #endif +#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ + && !defined(__llvm__) && !defined(__INTEL_COMPILER) +/* Defined if the compiler really is gcc and not clang, etc */ +#define MBEDTLS_COMPILER_IS_GCC +#define MBEDTLS_GCC_VERSION \ + (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) +#endif + #if defined(_MSC_VER) && !defined(_CRT_SECURE_NO_DEPRECATE) #define _CRT_SECURE_NO_DEPRECATE 1 #endif diff --git a/library/alignment.h b/library/alignment.h index b61301922..26f15261c 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,13 +15,7 @@ #include #include -#if defined(__GNUC__) && !defined(__ARMCC_VERSION) && !defined(__clang__) \ - && !defined(__llvm__) && !defined(__INTEL_COMPILER) -/* Defined if the compiler really is gcc and not clang, etc */ -#define MBEDTLS_COMPILER_IS_GCC -#define MBEDTLS_GCC_VERSION \ - (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__) -#endif +#include "mbedtls/build_info.h" /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory From 075f8797ac35925089116959c98366ccd2cb00e6 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Fri, 19 Jan 2024 16:48:42 +0000 Subject: [PATCH 10/12] Remove include of build_info.h Signed-off-by: Dave Rodgman --- library/alignment.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/library/alignment.h b/library/alignment.h index 26f15261c..248f29bc7 100644 --- a/library/alignment.h +++ b/library/alignment.h @@ -15,8 +15,6 @@ #include #include -#include "mbedtls/build_info.h" - /* * Define MBEDTLS_EFFICIENT_UNALIGNED_ACCESS for architectures where unaligned memory * accesses are known to be efficient. From 00b530e3957061a06663e1785dc923ee0b7e7c95 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 Jan 2024 09:36:34 +0000 Subject: [PATCH 11/12] Limit compiler hint to compilers that are known to benefit from it Signed-off-by: Dave Rodgman --- library/common.h | 50 ++++++++++++++++++++++++++++++++---------------- 1 file changed, 33 insertions(+), 17 deletions(-) diff --git a/library/common.h b/library/common.h index 2eb917037..937c80284 100644 --- a/library/common.h +++ b/library/common.h @@ -199,30 +199,40 @@ static inline void mbedtls_xor(unsigned char *r, uint8x16_t x = veorq_u8(v1, v2); vst1q_u8(r + i, x); } +#if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case * where n is a constant multiple of 16. 
- * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - * constant, and very little difference if n is not a compile-time constant. */ - if (n % 16 != 0) + * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time + * constant, and is a very small perf regression if n is not a compile-time constant. */ + if (n % 16 == 0) { + return; + } +#endif #elif defined(MBEDTLS_ARCH_IS_X64) || defined(MBEDTLS_ARCH_IS_ARM64) /* This codepath probably only makes sense on architectures with 64-bit registers */ for (; (i + 8) <= n; i += 8) { uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } - if (n % 8 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 8 == 0) { + return; + } +#endif #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } - if (n % 4 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 4 == 0) { + return; + } #endif #endif - { - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; - } +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; } } @@ -268,23 +278,29 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, uint64_t x = mbedtls_get_unaligned_uint64(a + i) ^ mbedtls_get_unaligned_uint64(b + i); mbedtls_put_unaligned_uint64(r + i, x); } +#if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case * where n is a constant multiple of 16. - * It makes no difference for others (e.g. recent gcc and clang) if n is a compile-time - * constant, and very little difference if n is not a compile-time constant. */ - if (n % 8 != 0) + * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time + * constant, and is a very small perf regression if n is not a compile-time constant. */ + if (n % 8 == 0) { + return; + } +#endif #else for (; (i + 4) <= n; i += 4) { uint32_t x = mbedtls_get_unaligned_uint32(a + i) ^ mbedtls_get_unaligned_uint32(b + i); mbedtls_put_unaligned_uint32(r + i, x); } - if (n % 4 != 0) +#if defined(__IAR_SYSTEMS_ICC__) + if (n % 4 == 0) { + return; + } #endif #endif - { - for (; i < n; i++) { - r[i] = a[i] ^ b[i]; - } +#endif + for (; i < n; i++) { + r[i] = a[i] ^ b[i]; } } From c64280a2d71f6e88835787b2121857f063af7029 Mon Sep 17 00:00:00 2001 From: Dave Rodgman Date: Tue, 23 Jan 2024 10:03:22 +0000 Subject: [PATCH 12/12] Fix comment typo Signed-off-by: Dave Rodgman --- library/common.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/library/common.h b/library/common.h index 937c80284..3936ffdfe 100644 --- a/library/common.h +++ b/library/common.h @@ -280,7 +280,7 @@ static inline void mbedtls_xor_no_simd(unsigned char *r, } #if defined(__IAR_SYSTEMS_ICC__) /* This if statement helps some compilers (e.g., IAR) optimise out the byte-by-byte tail case - * where n is a constant multiple of 16. + * where n is a constant multiple of 8. * For other compilers (e.g. recent gcc and clang) it makes no difference if n is a compile-time * constant, and is a very small perf regression if n is not a compile-time constant. */ if (n % 8 == 0) {
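The series above combines two ideas: an under-aligned typedef so that small constant-size unaligned accesses compile down to plain load/store instructions instead of a call to memcpy, and an early return after the word-sized loop so that the compiler can delete the byte-by-byte tail when the length is a compile-time constant multiple of the word size. The standalone sketch below illustrates both ideas in isolation. It is a simplified illustration, not the mbedtls code (which also has the IAR __packed path, NEON and 64-bit variants, and the always-inline annotations), and the names demo_u32_unaligned, demo_get_u32, demo_put_u32 and demo_xor are invented for the example. The under-aligned-typedef path is only enabled for real GCC here, mirroring the guard used in the patches; everything else falls back to memcpy.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Under-aligned typedef, mirroring the patch's mbedtls_uint32_unaligned_t.
 * Only enabled for real GCC; other compilers use the memcpy fallback. */
#if defined(__GNUC__) && !defined(__clang__)
typedef uint32_t __attribute__((__aligned__(1))) demo_u32_unaligned;
#define DEMO_UINT_UNALIGNED
#endif

static inline uint32_t demo_get_u32(const void *p)
{
#if defined(DEMO_UINT_UNALIGNED)
    return *(const demo_u32_unaligned *) p;   /* single (unaligned) load */
#else
    uint32_t r;
    memcpy(&r, p, sizeof(r));                 /* portable fallback */
    return r;
#endif
}

static inline void demo_put_u32(void *p, uint32_t x)
{
#if defined(DEMO_UINT_UNALIGNED)
    *(demo_u32_unaligned *) p = x;            /* single (unaligned) store */
#else
    memcpy(p, &x, sizeof(x));
#endif
}

/* Simplified XOR with the tail-elimination hint: when n is a compile-time
 * constant multiple of 4, the early return lets the compiler discard the
 * byte-by-byte tail loop entirely. */
static inline void demo_xor(unsigned char *r, const unsigned char *a,
                            const unsigned char *b, size_t n)
{
    size_t i = 0;
    for (; i + 4 <= n; i += 4) {
        demo_put_u32(r + i, demo_get_u32(a + i) ^ demo_get_u32(b + i));
    }
    if (n % 4 == 0) {
        return;
    }
    for (; i < n; i++) {
        r[i] = a[i] ^ b[i];
    }
}

int main(void)
{
    unsigned char a[13], b[13], r[13];
    for (size_t i = 0; i < sizeof(a); i++) {
        a[i] = (unsigned char) i;
        b[i] = (unsigned char) (0xA5 ^ i);
    }
    demo_xor(r, a, b, sizeof(r));
    for (size_t i = 0; i < sizeof(r); i++) {
        printf("%02x", r[i]);      /* every byte should print as a5 */
    }
    printf("\n");
    return 0;
}

Building this sketch at -Os with a fixed, constant n that is a multiple of 4 should show the byte loop disappearing and, on targets that support unaligned access, each 4-byte access becoming a single load/store, which is the effect the patches aim for; exact code generation will of course vary by compiler version and target.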