From 606cb1626f49b00d5c246b03bdda1c4ad086545e Mon Sep 17 00:00:00 2001
From: Hanno Becker <hanno.becker@arm.com>
Date: Sun, 17 Apr 2022 06:57:34 +0100
Subject: [PATCH] Add comment explaining structure of UMAAL assembly

Signed-off-by: Hanno Becker <hanno.becker@arm.com>
---
 library/bn_mul.h | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/library/bn_mul.h b/library/bn_mul.h
index 275be6048..962d7a97b 100644
--- a/library/bn_mul.h
+++ b/library/bn_mul.h
@@ -739,6 +739,16 @@
         mbedtls_mpi_uint tmp_a1, tmp_b1;             \
         asm volatile (
 
+            /* - Make sure the loop is 4-byte aligned to avoid stalls
+             *   upon repeated non-word-aligned instructions on
+             *   some microarchitectures.
+             * - Don't use ldm with post-increment or back-to-back
+             *   loads with post-increment and same address register
+             *   to avoid stalls on some microarchitectures.
+             * - Bunch loads and stores to reduce latency on some
+             *   microarchitectures. E.g., on Cortex-M4, the first
+             *   in a series of load/store operations has a latency of
+             *   2 cycles, while subsequent loads/stores are single-cycle. */
 #define MULADDC_X2_CORE                                           \
            ".p2align  2                                   \n\t"   \
             "ldr.w    %[a0], [%[in]],  #+8                \n\t"   \