diff --git a/library/constant_time.c b/library/constant_time.c index 12aed13f3..d3c69cfa8 100644 --- a/library/constant_time.c +++ b/library/constant_time.c @@ -150,8 +150,13 @@ void mbedtls_ct_memcpy_if(mbedtls_ct_condition_t condition, const unsigned char *src2, size_t len) { +#if defined(MBEDTLS_CT_SIZE_64) + const uint64_t mask = (uint64_t) condition; + const uint64_t not_mask = (uint64_t) ~mbedtls_ct_compiler_opaque(condition); +#else const uint32_t mask = (uint32_t) condition; const uint32_t not_mask = (uint32_t) ~mbedtls_ct_compiler_opaque(condition); +#endif /* If src2 is NULL, setup src2 so that we read from the destination address. * @@ -165,11 +170,19 @@ void mbedtls_ct_memcpy_if(mbedtls_ct_condition_t condition, /* dest[i] = c1 == c2 ? src[i] : dest[i] */ size_t i = 0; #if defined(MBEDTLS_EFFICIENT_UNALIGNED_ACCESS) +#if defined(MBEDTLS_CT_SIZE_64) + for (; (i + 8) <= len; i += 8) { + uint64_t a = mbedtls_get_unaligned_uint64(src1 + i) & mask; + uint64_t b = mbedtls_get_unaligned_uint64(src2 + i) & not_mask; + mbedtls_put_unaligned_uint64(dest + i, a | b); + } +#else for (; (i + 4) <= len; i += 4) { uint32_t a = mbedtls_get_unaligned_uint32(src1 + i) & mask; uint32_t b = mbedtls_get_unaligned_uint32(src2 + i) & not_mask; mbedtls_put_unaligned_uint32(dest + i, a | b); } +#endif /* defined(MBEDTLS_CT_SIZE_64) */ #endif /* MBEDTLS_EFFICIENT_UNALIGNED_ACCESS */ for (; i < len; i++) { dest[i] = (src1[i] & mask) | (src2[i] & not_mask); diff --git a/library/constant_time_impl.h b/library/constant_time_impl.h index b251a664c..8da15a84c 100644 --- a/library/constant_time_impl.h +++ b/library/constant_time_impl.h @@ -48,8 +48,14 @@ #pragma GCC diagnostic ignored "-Wredundant-decls" #endif -/* Disable asm under Memsan because it confuses Memsan and generates false errors */ -#if defined(MBEDTLS_TEST_CONSTANT_FLOW_MEMSAN) +/* Disable asm under Memsan because it confuses Memsan and generates false errors. + * + * We also disable under Valgrind by default, because it's more useful + * for Valgrind to test the plain C implementation. MBEDTLS_TEST_CONSTANT_FLOW_ASM //no-check-names + * may be set to permit building asm under Valgrind. + */ +#if defined(MBEDTLS_TEST_CONSTANT_FLOW_MEMSAN) || \ + (defined(MBEDTLS_TEST_CONSTANT_FLOW_VALGRIND) && !defined(MBEDTLS_TEST_CONSTANT_FLOW_ASM)) //no-check-names #define MBEDTLS_CT_NO_ASM #elif defined(__has_feature) #if __has_feature(memory_sanitizer) @@ -109,6 +115,28 @@ static inline mbedtls_ct_uint_t mbedtls_ct_compiler_opaque(mbedtls_ct_uint_t x) #endif } +/* + * Selecting unified syntax is needed for gcc, and harmless on clang. + * + * This is needed because on Thumb 1, condition flags are always set, so + * e.g. "negs" is supported but "neg" is not (on Thumb 2, both exist). + * + * Under Thumb 1 unified syntax, only the "negs" form is accepted, and + * under divided syntax, only the "neg" form is accepted. clang only + * supports unified syntax. + * + * On Thumb 2 and Arm, both compilers are happy with the "s" suffix, + * although we don't actually care about setting the flags. + * + * For gcc, restore divided syntax afterwards - otherwise old versions of gcc + * seem to apply unified syntax globally, which breaks other asm code. + */ +#if !defined(__clang__) +#define RESTORE_ASM_SYNTAX ".syntax divided \n\t" +#else +#define RESTORE_ASM_SYNTAX +#endif + /* Convert a number into a condition in constant time. 
*/ static inline mbedtls_ct_condition_t mbedtls_ct_bool(mbedtls_ct_uint_t x) { @@ -120,6 +148,34 @@ static inline mbedtls_ct_condition_t mbedtls_ct_bool(mbedtls_ct_uint_t x) * Otherwise, we define a plain C fallback which (in May 2023) does not get optimised into * conditional instructions or branches by trunk clang, gcc, or MSVC v19. */ +#if defined(MBEDTLS_CT_AARCH64_ASM) && (defined(MBEDTLS_CT_SIZE_32) || defined(MBEDTLS_CT_SIZE_64)) + mbedtls_ct_uint_t s; + asm volatile ("neg %x[s], %x[x] \n\t" + "orr %x[x], %x[s], %x[x] \n\t" + "asr %x[x], %x[x], 63" + : + [s] "=&r" (s), + [x] "+&r" (x) + : + : + ); + return (mbedtls_ct_condition_t) x; +#elif defined(MBEDTLS_CT_ARM_ASM) && defined(MBEDTLS_CT_SIZE_32) + uint32_t s; + asm volatile (".syntax unified \n\t" + "negs %[s], %[x] \n\t" + "orrs %[x], %[x], %[s] \n\t" + "asrs %[x], %[x], #31 \n\t" + RESTORE_ASM_SYNTAX + : + [s] "=&l" (s), + [x] "+&l" (x) + : + : + "cc" /* clobbers flag bits */ + ); + return (mbedtls_ct_condition_t) x; +#else const mbedtls_ct_uint_t xo = mbedtls_ct_compiler_opaque(x); #if defined(_MSC_VER) /* MSVC has a warning about unary minus on unsigned, but this is @@ -127,24 +183,98 @@ static inline mbedtls_ct_condition_t mbedtls_ct_bool(mbedtls_ct_uint_t x) #pragma warning( push ) #pragma warning( disable : 4146 ) #endif - return (mbedtls_ct_condition_t) (((mbedtls_ct_int_t) ((-xo) | -(xo >> 1))) >> - (MBEDTLS_CT_SIZE - 1)); + // y is negative (i.e., top bit set) iff x is non-zero + mbedtls_ct_int_t y = (-xo) | -(xo >> 1); + + // extract only the sign bit of y so that y == 1 (if x is non-zero) or 0 (if x is zero) + y = (((mbedtls_ct_uint_t) y) >> (MBEDTLS_CT_SIZE - 1)); + + // -y has all bits set (if x is non-zero), or all bits clear (if x is zero) + return (mbedtls_ct_condition_t) (-y); #if defined(_MSC_VER) #pragma warning( pop ) #endif +#endif } static inline mbedtls_ct_uint_t mbedtls_ct_if(mbedtls_ct_condition_t condition, mbedtls_ct_uint_t if1, mbedtls_ct_uint_t if0) { +#if defined(MBEDTLS_CT_AARCH64_ASM) && (defined(MBEDTLS_CT_SIZE_32) || defined(MBEDTLS_CT_SIZE_64)) + asm volatile ("and %x[if1], %x[if1], %x[condition] \n\t" + "mvn %x[condition], %x[condition] \n\t" + "and %x[condition], %x[condition], %x[if0] \n\t" + "orr %x[condition], %x[if1], %x[condition]" + : + [condition] "+&r" (condition), + [if1] "+&r" (if1) + : + [if0] "r" (if0) + : + ); + return (mbedtls_ct_uint_t) condition; +#elif defined(MBEDTLS_CT_ARM_ASM) && defined(MBEDTLS_CT_SIZE_32) + asm volatile (".syntax unified \n\t" + "ands %[if1], %[if1], %[condition] \n\t" + "mvns %[condition], %[condition] \n\t" + "ands %[condition], %[condition], %[if0] \n\t" + "orrs %[condition], %[if1], %[condition] \n\t" + RESTORE_ASM_SYNTAX + : + [condition] "+&l" (condition), + [if1] "+&l" (if1) + : + [if0] "l" (if0) + : + "cc" + ); + return (mbedtls_ct_uint_t) condition; +#else mbedtls_ct_condition_t not_cond = (mbedtls_ct_condition_t) (~mbedtls_ct_compiler_opaque(condition)); return (mbedtls_ct_uint_t) ((condition & if1) | (not_cond & if0)); +#endif } static inline mbedtls_ct_condition_t mbedtls_ct_uint_lt(mbedtls_ct_uint_t x, mbedtls_ct_uint_t y) { +#if defined(MBEDTLS_CT_AARCH64_ASM) && (defined(MBEDTLS_CT_SIZE_32) || defined(MBEDTLS_CT_SIZE_64)) + uint64_t s1; + asm volatile ("eor %x[s1], %x[y], %x[x] \n\t" + "sub %x[x], %x[x], %x[y] \n\t" + "bic %x[x], %x[x], %x[s1] \n\t" + "and %x[s1], %x[s1], %x[y] \n\t" + "orr %x[s1], %x[x], %x[s1] \n\t" + "asr %x[x], %x[s1], 63" + : [s1] "=&r" (s1), [x] "+&r" (x) + : [y] "r" (y) + : + ); + return (mbedtls_ct_condition_t) x; 
+#elif defined(MBEDTLS_CT_ARM_ASM) && defined(MBEDTLS_CT_SIZE_32) + uint32_t s1; + asm volatile ( + ".syntax unified \n\t" +#if defined(__thumb__) && !defined(__thumb2__) + "movs %[s1], %[x] \n\t" + "eors %[s1], %[s1], %[y] \n\t" +#else + "eors %[s1], %[x], %[y] \n\t" +#endif + "subs %[x], %[x], %[y] \n\t" + "bics %[x], %[x], %[s1] \n\t" + "ands %[y], %[s1], %[y] \n\t" + "orrs %[x], %[x], %[y] \n\t" + "asrs %[x], %[x], #31 \n\t" + RESTORE_ASM_SYNTAX + : [s1] "=&l" (s1), [x] "+&l" (x), [y] "+&l" (y) + : + : + "cc" + ); + return (mbedtls_ct_condition_t) x; +#else /* Ensure that the compiler cannot optimise the following operations over x and y, * even if it knows the value of x and y. */ @@ -173,6 +303,7 @@ static inline mbedtls_ct_condition_t mbedtls_ct_uint_lt(mbedtls_ct_uint_t x, mbe // Convert to a condition (i.e., all bits set iff non-zero) return mbedtls_ct_bool(ret); +#endif } static inline mbedtls_ct_condition_t mbedtls_ct_uint_ne(mbedtls_ct_uint_t x, mbedtls_ct_uint_t y) diff --git a/library/constant_time_internal.h b/library/constant_time_internal.h index dabf720aa..44b74aec6 100644 --- a/library/constant_time_internal.h +++ b/library/constant_time_internal.h @@ -85,12 +85,14 @@ typedef ptrdiff_t mbedtls_ct_int_t; typedef uint64_t mbedtls_ct_condition_t; typedef uint64_t mbedtls_ct_uint_t; typedef int64_t mbedtls_ct_int_t; +#define MBEDTLS_CT_SIZE_64 #define MBEDTLS_CT_TRUE ((mbedtls_ct_condition_t) mbedtls_ct_compiler_opaque(UINT64_MAX)) #else /* Pointer size <= 32-bit, and no 64-bit MPIs */ typedef uint32_t mbedtls_ct_condition_t; typedef uint32_t mbedtls_ct_uint_t; typedef int32_t mbedtls_ct_int_t; +#define MBEDTLS_CT_SIZE_32 #define MBEDTLS_CT_TRUE ((mbedtls_ct_condition_t) mbedtls_ct_compiler_opaque(UINT32_MAX)) #endif #define MBEDTLS_CT_FALSE ((mbedtls_ct_condition_t) mbedtls_ct_compiler_opaque(0)) diff --git a/tests/scripts/all.sh b/tests/scripts/all.sh index 8e978ac72..c3c12752f 100755 --- a/tests/scripts/all.sh +++ b/tests/scripts/all.sh @@ -1872,6 +1872,16 @@ skip_suites_without_constant_flow () { export SKIP_TEST_SUITES } +skip_all_except_given_suite () { + # Skip all but the given test suite + SKIP_TEST_SUITES=$( + ls -1 tests/suites/test_suite_*.function | + grep -v $1.function | + sed 's/tests.suites.test_suite_//; s/\.function$//' | + tr '\n' ,) + export SKIP_TEST_SUITES +} + component_test_memsan_constant_flow () { # This tests both (1) accesses to undefined memory, and (2) branches or # memory access depending on secret values. To distinguish between those: @@ -1931,6 +1941,16 @@ component_test_valgrind_constant_flow () { # details are left in Testing//DynamicAnalysis.xml msg "test: some suites (full minus MBEDTLS_USE_PSA_CRYPTO, valgrind + constant flow)" make memcheck + + # Test asm path in constant time module - by default, it will test the plain C + # path under Valgrind or Memsan. Running only the constant_time tests is fast (<1s) + msg "test: valgrind asm constant_time" + scripts/config.py --force set MBEDTLS_TEST_CONSTANT_FLOW_ASM + skip_all_except_given_suite test_suite_constant_time + cmake -D CMAKE_BUILD_TYPE:String=Release . + make clean + make + make memcheck } component_test_valgrind_constant_flow_psa () {
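
For reference, a minimal standalone sketch of the mask-based idiom that the plain C paths of mbedtls_ct_bool and mbedtls_ct_if (and the asm above) rely on. The names ct_bool32, ct_select32 and the checks in main are illustrative only, not mbedtls API, and this omits the mbedtls_ct_compiler_opaque() barrier that the real code uses to keep the compiler from re-introducing branches:

#include <stdint.h>
#include <assert.h>

/* All-ones if x != 0, all-zeros if x == 0, with no data-dependent branch.
 * Same trick as the expanded C fallback of mbedtls_ct_bool, for 32-bit words. */
static uint32_t ct_bool32(uint32_t x)
{
    /* (-x) | -(x >> 1) has its top bit set iff x is non-zero. */
    uint32_t y = (0u - x) | (0u - (x >> 1));
    y >>= 31;            /* y is now 1 (x != 0) or 0 (x == 0) */
    return 0u - y;       /* broadcast to 0xFFFFFFFF or 0x00000000 */
}

/* condition ? if1 : if0, using the same masking scheme as mbedtls_ct_if. */
static uint32_t ct_select32(uint32_t condition, uint32_t if1, uint32_t if0)
{
    uint32_t mask = ct_bool32(condition);
    return (if1 & mask) | (if0 & ~mask);
}

int main(void)
{
    assert(ct_bool32(0) == 0u);
    assert(ct_bool32(5) == 0xFFFFFFFFu);
    assert(ct_select32(1, 0x11111111u, 0x22222222u) == 0x11111111u);
    assert(ct_select32(0, 0x11111111u, 0x22222222u) == 0x22222222u);
    return 0;
}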
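
Similarly, the comparison implemented by both asm variants of mbedtls_ct_uint_lt can be sketched in plain C as below. When the top bits of x and y agree, the sign of (x - y) decides the result; when they differ, y's top bit alone decides it. Again, ct_lt32 and the asserts are hypothetical names for illustration, without the compiler-opacity measures of the real implementation:

#include <stdint.h>
#include <assert.h>

/* All-ones if x < y (unsigned), all-zeros otherwise, with no branches. */
static uint32_t ct_lt32(uint32_t x, uint32_t y)
{
    uint32_t diff_bits = x ^ y;     /* top bit set iff the top bits of x and y differ */
    uint32_t sub       = x - y;     /* sign of the subtraction is valid if they agree */
    uint32_t result    = (sub & ~diff_bits) | (y & diff_bits);
    return 0u - (result >> 31);     /* broadcast the sign bit */
}

int main(void)
{
    assert(ct_lt32(1, 2) == 0xFFFFFFFFu);
    assert(ct_lt32(2, 1) == 0u);
    assert(ct_lt32(7, 7) == 0u);
    assert(ct_lt32(0x7FFFFFFFu, 0x80000000u) == 0xFFFFFFFFu);
    assert(ct_lt32(0x80000000u, 0x7FFFFFFFu) == 0u);
    return 0;
}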