Merge pull request #5632 from tom-cosgrove-arm/seclib-667-sha512-acceleration-mbedtls-internal
SECLIB-667: Accelerate SHA-512 with A64 crypto extensions
Commit 1c41501949 · 7 changed files with 563 additions and 19 deletions
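The acceleration is transparent to callers: nothing in the public API changes, and the dispatch happens inside the internal block-processing functions shown below. For orientation only (this sketch is ours, not part of the diff; it uses the standard Mbed TLS 3.x streaming API):

#include "mbedtls/sha512.h"

/* Orientation sketch (not from this PR): ordinary use of the public API.
 * On AArch64 with MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT enabled, the
 * block processing behind mbedtls_sha512_update() transparently uses the
 * SHA-512 instructions when the CPU has them. */
int hash_message( const unsigned char *msg, size_t len,
                  unsigned char digest[64] )
{
    mbedtls_sha512_context ctx;
    int ret;

    mbedtls_sha512_init( &ctx );
    if( ( ret = mbedtls_sha512_starts( &ctx, 0 ) ) != 0 )   /* 0 = SHA-512 */
        goto exit;
    if( ( ret = mbedtls_sha512_update( &ctx, msg, len ) ) != 0 )
        goto exit;
    ret = mbedtls_sha512_finish( &ctx, digest );

exit:
    mbedtls_sha512_free( &ctx );
    return( ret );
}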
library/sha256.c

@@ -49,8 +49,15 @@
       defined(MBEDTLS_SHA256_USE_A64_CRYPTO_ONLY)
 #    include <arm_neon.h>
 #  endif
-#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) && defined(__linux__)
-#    include <sys/auxv.h>
+#  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT)
+#    if defined(__unix__)
+#      if defined(__linux__)
+/* Our preferred method of detection is getauxval() */
+#        include <sys/auxv.h>
+#      endif
+/* Use SIGILL on Unix, and fall back to it on Linux */
+#      include <signal.h>
+#    endif
 #  endif
 #elif defined(_M_ARM64)
 #  if defined(MBEDTLS_SHA256_USE_A64_CRYPTO_IF_PRESENT) || \
@@ -272,10 +279,10 @@ static size_t mbedtls_internal_sha256_process_many_a64_crypto(
     uint32x4_t abcd_orig = abcd;
     uint32x4_t efgh_orig = efgh;

-    uint32x4_t sched0 = vld1q_u32( (const uint32_t *)( msg + 16 * 0 ) );
-    uint32x4_t sched1 = vld1q_u32( (const uint32_t *)( msg + 16 * 1 ) );
-    uint32x4_t sched2 = vld1q_u32( (const uint32_t *)( msg + 16 * 2 ) );
-    uint32x4_t sched3 = vld1q_u32( (const uint32_t *)( msg + 16 * 3 ) );
+    uint32x4_t sched0 = (uint32x4_t) vld1q_u8( msg + 16 * 0 );
+    uint32x4_t sched1 = (uint32x4_t) vld1q_u8( msg + 16 * 1 );
+    uint32x4_t sched2 = (uint32x4_t) vld1q_u8( msg + 16 * 2 );
+    uint32x4_t sched3 = (uint32x4_t) vld1q_u8( msg + 16 * 3 );

 #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* Will be true if not defined */
 /* Untested on BE */
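The hunk above is subtler than it looks: the old code cast msg, an arbitrary byte pointer, to const uint32_t *, which is undefined behaviour when msg is not 4-byte aligned; vld1q_u8() takes the bytes as they are. That appears to be the motivation here. A standalone sketch of the same idiom (the helper name is ours), including the per-lane byte swap that SHA-256's big-endian words need on little-endian machines:

#include <arm_neon.h>
#include <stdint.h>

/* Hypothetical helper showing the load idiom used above: fetch four
 * big-endian 32-bit words from a possibly unaligned byte pointer. */
static inline uint32x4_t load_be_u32x4( const uint8_t *p )
{
    uint8x16_t bytes = vld1q_u8( p );       /* byte load: no alignment assumed */
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
    bytes = vrev32q_u8( bytes );            /* swap bytes within each 32-bit lane */
#endif
    return( vreinterpretq_u32_u8( bytes ) );
}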
library/sha512.c (439 changed lines)
@@ -50,12 +50,128 @@
 #endif /* MBEDTLS_PLATFORM_C */
 #endif /* MBEDTLS_SELF_TEST */

+#if defined(__aarch64__)
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+      defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#    include <arm_neon.h>
+#  endif
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+#    if defined(__unix__)
+#      if defined(__linux__)
+/* Our preferred method of detection is getauxval() */
+#        include <sys/auxv.h>
+#      endif
+/* Use SIGILL on Unix, and fall back to it on Linux */
+#      include <signal.h>
+#    endif
+#  endif
+#elif defined(_M_ARM64)
+#  if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+      defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#    include <arm64_neon.h>
+#  endif
+#else
+#  undef MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY
+#  undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+/*
+ * Capability detection code comes early, so we can disable
+ * MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT if no detection mechanism found
+ */
+#if defined(HWCAP_SHA512)
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    return( ( getauxval( AT_HWCAP ) & HWCAP_SHA512 ) ? 1 : 0 );
+}
+#elif defined(__APPLE__)
+#include <sys/types.h>
+#include <sys/sysctl.h>
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    int value = 0;
+    size_t value_len = sizeof(value);
+
+    int ret = sysctlbyname( "hw.optional.armv8_2_sha512", &value, &value_len,
+                            NULL, 0 );
+    return( ret == 0 && value != 0 );
+}
+#elif defined(_M_ARM64)
+/*
+ * As of March 2022, there don't appear to be any PF_ARM_V8_* flags
+ * available to pass to IsProcessorFeaturePresent() to check for
+ * SHA-512 support. So we fall back to the C code only.
+ */
+#if defined(_MSC_VER)
+#pragma message "No mechanism to detect A64_CRYPTO found, using C code only"
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#endif
+#elif defined(__unix__) && defined(SIG_SETMASK)
+/* Detection with SIGILL, setjmp() and longjmp() */
+#include <signal.h>
+#include <setjmp.h>
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+static jmp_buf return_from_sigill;
+
+/*
+ * A64 SHA512 support detection via SIGILL
+ */
+static void sigill_handler( int signal )
+{
+    (void) signal;
+    longjmp( return_from_sigill, 1 );
+}
+
+static int mbedtls_a64_crypto_sha512_determine_support( void )
+{
+    struct sigaction old_action, new_action;
+
+    sigset_t old_mask;
+    if( sigprocmask( 0, NULL, &old_mask ) )
+        return( 0 );
+
+    sigemptyset( &new_action.sa_mask );
+    new_action.sa_flags = 0;
+    new_action.sa_handler = sigill_handler;
+
+    sigaction( SIGILL, &new_action, &old_action );
+
+    static int ret = 0;
+
+    if( setjmp( return_from_sigill ) == 0 )         /* First return only */
+    {
+        /* If this traps, we will return a second time from setjmp() with 1 */
+        asm( "sha512h q0, q0, v0.2d" : : : "v0" );
+        ret = 1;
+    }
+
+    sigaction( SIGILL, &old_action, NULL );
+    sigprocmask( SIG_SETMASK, &old_mask, NULL );
+
+    return( ret );
+}
+#else
+#warning "No mechanism to detect A64_CRYPTO found, using C code only"
+#undef MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT
+#endif /* HWCAP_SHA512, __APPLE__, __unix__ && SIG_SETMASK */
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */
+
 #define SHA512_VALIDATE_RET(cond) \
     MBEDTLS_INTERNAL_VALIDATE_RET( cond, MBEDTLS_ERR_SHA512_BAD_INPUT_DATA )
 #define SHA512_VALIDATE(cond)  MBEDTLS_INTERNAL_VALIDATE( cond )

 #if !defined(MBEDTLS_SHA512_ALT)

+#define SHA512_BLOCK_SIZE 128
+
 #if defined(MBEDTLS_SHA512_SMALLER)
 static void sha512_put_uint64_be( uint64_t n, unsigned char *b, uint8_t i )
 {
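The hunk above wires in three runtime detection mechanisms: getauxval() where HWCAP_SHA512 is available (Linux), sysctlbyname() on Apple platforms, and a SIGILL probe elsewhere on Unix, which executes one sha512h instruction under a temporary signal handler and uses setjmp()/longjmp() to survive the trap on CPUs that lack it. A minimal standalone version of the preferred check (our sketch, AArch64 Linux only), handy for confirming what the library will detect on a given machine:

#include <stdio.h>
#include <sys/auxv.h>     /* getauxval(), AT_HWCAP */
#include <asm/hwcap.h>    /* HWCAP_SHA512 (AArch64 Linux kernel header) */

int main( void )
{
    /* Prints 1 if the kernel advertises the SHA-512 instructions, else 0;
     * this is the same bit the library's getauxval() path tests. */
    printf( "%d\n", ( getauxval( AT_HWCAP ) & HWCAP_SHA512 ) ? 1 : 0 );
    return 0;
}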
@@ -188,9 +304,249 @@ static const uint64_t K[80] =
     UL64(0x4CC5D4BECB3E42B6),  UL64(0x597F299CFC657E2A),
     UL64(0x5FCB6FAB3AD6FAEC),  UL64(0x6C44198C4A475817)
 };
 #endif

-int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
-                                     const unsigned char data[128] )
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT) || \
+    defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+#  define mbedtls_internal_sha512_process_many_a64_crypto mbedtls_internal_sha512_process_many
+#  define mbedtls_internal_sha512_process_a64_crypto      mbedtls_internal_sha512_process
+#endif
+
+#ifndef asm
+#define asm __asm__
+#endif
+
+/* Accelerated SHA-512 implementation originally written by Simon Tatham for PuTTY,
+ * under the MIT licence; dual-licensed as Apache 2 with his kind permission.
+ */
+
+#if defined(__clang__) && \
+    (__clang_major__ < 13 || \
+     (__clang_major__ == 13 && __clang_minor__ == 0 && __clang_patchlevel__ == 0))
+static inline uint64x2_t vsha512su0q_u64(uint64x2_t x, uint64x2_t y)
+{
+    asm( "sha512su0 %0.2D,%1.2D" : "+w" (x) : "w" (y) );
+    return( x );
+}
+static inline uint64x2_t vsha512su1q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+    asm( "sha512su1 %0.2D,%1.2D,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+    return( x );
+}
+static inline uint64x2_t vsha512hq_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+    asm( "sha512h %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+    return( x );
+}
+static inline uint64x2_t vsha512h2q_u64(uint64x2_t x, uint64x2_t y, uint64x2_t z)
+{
+    asm( "sha512h2 %0,%1,%2.2D" : "+w" (x) : "w" (y), "w" (z) );
+    return( x );
+}
+#endif /* __clang__ etc */
+
+static size_t mbedtls_internal_sha512_process_many_a64_crypto(
+        mbedtls_sha512_context *ctx, const uint8_t *msg, size_t len )
+{
+    uint64x2_t ab = vld1q_u64( &ctx->state[0] );
+    uint64x2_t cd = vld1q_u64( &ctx->state[2] );
+    uint64x2_t ef = vld1q_u64( &ctx->state[4] );
+    uint64x2_t gh = vld1q_u64( &ctx->state[6] );
+
+    size_t processed = 0;
+
+    for ( ;
+          len >= SHA512_BLOCK_SIZE;
+          processed += SHA512_BLOCK_SIZE,
+                msg += SHA512_BLOCK_SIZE,
+                len -= SHA512_BLOCK_SIZE )
+    {
+        uint64x2_t initial_sum, sum, intermed;
+
+        uint64x2_t ab_orig = ab;
+        uint64x2_t cd_orig = cd;
+        uint64x2_t ef_orig = ef;
+        uint64x2_t gh_orig = gh;
+
+        uint64x2_t s0 = (uint64x2_t) vld1q_u8( msg + 16 * 0 );
+        uint64x2_t s1 = (uint64x2_t) vld1q_u8( msg + 16 * 1 );
+        uint64x2_t s2 = (uint64x2_t) vld1q_u8( msg + 16 * 2 );
+        uint64x2_t s3 = (uint64x2_t) vld1q_u8( msg + 16 * 3 );
+        uint64x2_t s4 = (uint64x2_t) vld1q_u8( msg + 16 * 4 );
+        uint64x2_t s5 = (uint64x2_t) vld1q_u8( msg + 16 * 5 );
+        uint64x2_t s6 = (uint64x2_t) vld1q_u8( msg + 16 * 6 );
+        uint64x2_t s7 = (uint64x2_t) vld1q_u8( msg + 16 * 7 );
+
+#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ /* assume LE if these not defined; untested on BE */
+        s0 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s0 ) ) );
+        s1 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s1 ) ) );
+        s2 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s2 ) ) );
+        s3 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s3 ) ) );
+        s4 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s4 ) ) );
+        s5 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s5 ) ) );
+        s6 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s6 ) ) );
+        s7 = vreinterpretq_u64_u8( vrev64q_u8( vreinterpretq_u8_u64( s7 ) ) );
+#endif
+
+        /* Rounds 0 and 1 */
+        initial_sum = vaddq_u64( s0, vld1q_u64( &K[0] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+        intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+        gh = vsha512h2q_u64( intermed, cd, ab );
+        cd = vaddq_u64( cd, intermed );
+
+        /* Rounds 2 and 3 */
+        initial_sum = vaddq_u64( s1, vld1q_u64( &K[2] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+        intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+        ef = vsha512h2q_u64( intermed, ab, gh );
+        ab = vaddq_u64( ab, intermed );
+
+        /* Rounds 4 and 5 */
+        initial_sum = vaddq_u64( s2, vld1q_u64( &K[4] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+        intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+        cd = vsha512h2q_u64( intermed, gh, ef );
+        gh = vaddq_u64( gh, intermed );
+
+        /* Rounds 6 and 7 */
+        initial_sum = vaddq_u64( s3, vld1q_u64( &K[6] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+        intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+        ab = vsha512h2q_u64( intermed, ef, cd );
+        ef = vaddq_u64( ef, intermed );
+
+        /* Rounds 8 and 9 */
+        initial_sum = vaddq_u64( s4, vld1q_u64( &K[8] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+        intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+        gh = vsha512h2q_u64( intermed, cd, ab );
+        cd = vaddq_u64( cd, intermed );
+
+        /* Rounds 10 and 11 */
+        initial_sum = vaddq_u64( s5, vld1q_u64( &K[10] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+        intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+        ef = vsha512h2q_u64( intermed, ab, gh );
+        ab = vaddq_u64( ab, intermed );
+
+        /* Rounds 12 and 13 */
+        initial_sum = vaddq_u64( s6, vld1q_u64( &K[12] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+        intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+        cd = vsha512h2q_u64( intermed, gh, ef );
+        gh = vaddq_u64( gh, intermed );
+
+        /* Rounds 14 and 15 */
+        initial_sum = vaddq_u64( s7, vld1q_u64( &K[14] ) );
+        sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+        intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+        ab = vsha512h2q_u64( intermed, ef, cd );
+        ef = vaddq_u64( ef, intermed );
+
+        for ( unsigned int t = 16; t < 80; t += 16 )
+        {
+            /* Rounds t and t + 1 */
+            s0 = vsha512su1q_u64( vsha512su0q_u64( s0, s1 ), s7, vextq_u64( s4, s5, 1 ) );
+            initial_sum = vaddq_u64( s0, vld1q_u64( &K[t] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+            intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+            gh = vsha512h2q_u64( intermed, cd, ab );
+            cd = vaddq_u64( cd, intermed );
+
+            /* Rounds t + 2 and t + 3 */
+            s1 = vsha512su1q_u64( vsha512su0q_u64( s1, s2 ), s0, vextq_u64( s5, s6, 1 ) );
+            initial_sum = vaddq_u64( s1, vld1q_u64( &K[t + 2] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+            intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+            ef = vsha512h2q_u64( intermed, ab, gh );
+            ab = vaddq_u64( ab, intermed );
+
+            /* Rounds t + 4 and t + 5 */
+            s2 = vsha512su1q_u64( vsha512su0q_u64( s2, s3 ), s1, vextq_u64( s6, s7, 1 ) );
+            initial_sum = vaddq_u64( s2, vld1q_u64( &K[t + 4] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+            intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+            cd = vsha512h2q_u64( intermed, gh, ef );
+            gh = vaddq_u64( gh, intermed );
+
+            /* Rounds t + 6 and t + 7 */
+            s3 = vsha512su1q_u64( vsha512su0q_u64( s3, s4 ), s2, vextq_u64( s7, s0, 1 ) );
+            initial_sum = vaddq_u64( s3, vld1q_u64( &K[t + 6] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+            intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+            ab = vsha512h2q_u64( intermed, ef, cd );
+            ef = vaddq_u64( ef, intermed );
+
+            /* Rounds t + 8 and t + 9 */
+            s4 = vsha512su1q_u64( vsha512su0q_u64( s4, s5 ), s3, vextq_u64( s0, s1, 1 ) );
+            initial_sum = vaddq_u64( s4, vld1q_u64( &K[t + 8] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), gh );
+            intermed = vsha512hq_u64( sum, vextq_u64( ef, gh, 1 ), vextq_u64( cd, ef, 1 ) );
+            gh = vsha512h2q_u64( intermed, cd, ab );
+            cd = vaddq_u64( cd, intermed );
+
+            /* Rounds t + 10 and t + 11 */
+            s5 = vsha512su1q_u64( vsha512su0q_u64( s5, s6 ), s4, vextq_u64( s1, s2, 1 ) );
+            initial_sum = vaddq_u64( s5, vld1q_u64( &K[t + 10] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ef );
+            intermed = vsha512hq_u64( sum, vextq_u64( cd, ef, 1 ), vextq_u64( ab, cd, 1 ) );
+            ef = vsha512h2q_u64( intermed, ab, gh );
+            ab = vaddq_u64( ab, intermed );
+
+            /* Rounds t + 12 and t + 13 */
+            s6 = vsha512su1q_u64( vsha512su0q_u64( s6, s7 ), s5, vextq_u64( s2, s3, 1 ) );
+            initial_sum = vaddq_u64( s6, vld1q_u64( &K[t + 12] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), cd );
+            intermed = vsha512hq_u64( sum, vextq_u64( ab, cd, 1 ), vextq_u64( gh, ab, 1 ) );
+            cd = vsha512h2q_u64( intermed, gh, ef );
+            gh = vaddq_u64( gh, intermed );
+
+            /* Rounds t + 14 and t + 15 */
+            s7 = vsha512su1q_u64( vsha512su0q_u64( s7, s0 ), s6, vextq_u64( s3, s4, 1 ) );
+            initial_sum = vaddq_u64( s7, vld1q_u64( &K[t + 14] ) );
+            sum = vaddq_u64( vextq_u64( initial_sum, initial_sum, 1 ), ab );
+            intermed = vsha512hq_u64( sum, vextq_u64( gh, ab, 1 ), vextq_u64( ef, gh, 1 ) );
+            ab = vsha512h2q_u64( intermed, ef, cd );
+            ef = vaddq_u64( ef, intermed );
+        }
+
+        ab = vaddq_u64( ab, ab_orig );
+        cd = vaddq_u64( cd, cd_orig );
+        ef = vaddq_u64( ef, ef_orig );
+        gh = vaddq_u64( gh, gh_orig );
+    }
+
+    vst1q_u64( &ctx->state[0], ab );
+    vst1q_u64( &ctx->state[2], cd );
+    vst1q_u64( &ctx->state[4], ef );
+    vst1q_u64( &ctx->state[6], gh );
+
+    return( processed );
+}
+
+int mbedtls_internal_sha512_process_a64_crypto( mbedtls_sha512_context *ctx,
+        const unsigned char data[SHA512_BLOCK_SIZE] )
+{
+    return( mbedtls_internal_sha512_process_many_a64_crypto( ctx, data,
+                SHA512_BLOCK_SIZE ) == SHA512_BLOCK_SIZE ) ? 0 : -1;
+}
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT || MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+#define mbedtls_internal_sha512_process_many_c mbedtls_internal_sha512_process_many
+#define mbedtls_internal_sha512_process_c      mbedtls_internal_sha512_process
+#endif
+
+
+#if !defined(MBEDTLS_SHA512_PROCESS_ALT) && !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+int mbedtls_internal_sha512_process_c( mbedtls_sha512_context *ctx,
+                                       const unsigned char data[SHA512_BLOCK_SIZE] )
 {
     int i;
     struct
@@ -291,7 +647,68 @@ int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
     return( 0 );
 }

-#endif /* !MBEDTLS_SHA512_PROCESS_ALT */
+#endif /* !MBEDTLS_SHA512_PROCESS_ALT && !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if !defined(MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY)
+
+static size_t mbedtls_internal_sha512_process_many_c(
+        mbedtls_sha512_context *ctx, const uint8_t *data, size_t len )
+{
+    size_t processed = 0;
+
+    while( len >= SHA512_BLOCK_SIZE )
+    {
+        if( mbedtls_internal_sha512_process_c( ctx, data ) != 0 )
+            return( 0 );
+
+        data += SHA512_BLOCK_SIZE;
+        len  -= SHA512_BLOCK_SIZE;
+
+        processed += SHA512_BLOCK_SIZE;
+    }
+
+    return( processed );
+}
+
+#endif /* !MBEDTLS_SHA512_USE_A64_CRYPTO_ONLY */
+
+
+#if defined(MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT)
+
+int mbedtls_a64_crypto_sha512_has_support( void )
+{
+    static int done = 0;
+    static int supported = 0;
+
+    if( !done )
+    {
+        supported = mbedtls_a64_crypto_sha512_determine_support();
+        done = 1;
+    }
+
+    return( supported );
+}
+
+static size_t mbedtls_internal_sha512_process_many( mbedtls_sha512_context *ctx,
+        const uint8_t *msg, size_t len )
+{
+    if( mbedtls_a64_crypto_sha512_has_support() )
+        return( mbedtls_internal_sha512_process_many_a64_crypto( ctx, msg, len ) );
+    else
+        return( mbedtls_internal_sha512_process_many_c( ctx, msg, len ) );
+}
+
+int mbedtls_internal_sha512_process( mbedtls_sha512_context *ctx,
+        const unsigned char data[SHA512_BLOCK_SIZE] )
+{
+    if( mbedtls_a64_crypto_sha512_has_support() )
+        return( mbedtls_internal_sha512_process_a64_crypto( ctx, data ) );
+    else
+        return( mbedtls_internal_sha512_process_c( ctx, data ) );
+}
+
+#endif /* MBEDTLS_SHA512_USE_A64_CRYPTO_IF_PRESENT */

 /*
  * SHA-512 process buffer
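Note the shape of mbedtls_a64_crypto_sha512_has_support() in the hunk above: the potentially expensive probe (a sysctl, or a deliberate SIGILL) runs at most once per process and is cached in function-local statics, so the per-block dispatch costs one predictable branch. A self-contained illustration of that call-once pattern (names are ours, not from the diff):

#include <stdio.h>

/* Stand-in for an expensive probe such as the SIGILL test above. */
static int example_determine_support( void )
{
    puts( "probe ran" );        /* should appear only once */
    return( 1 );
}

/* Same call-once caching shape as mbedtls_a64_crypto_sha512_has_support().
 * If two threads race on the first call, the worst case is that the probe
 * runs twice and both writers store the same answer. */
static int example_has_support( void )
{
    static int done = 0;
    static int supported = 0;

    if( !done )
    {
        supported = example_determine_support();
        done = 1;
    }

    return( supported );
}

int main( void )
{
    printf( "%d %d\n", example_has_support(), example_has_support() );
    return( 0 );
}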
@@ -311,7 +728,7 @@ int mbedtls_sha512_update( mbedtls_sha512_context *ctx,
         return( 0 );

     left = (unsigned int) (ctx->total[0] & 0x7F);
-    fill = 128 - left;
+    fill = SHA512_BLOCK_SIZE - left;

     ctx->total[0] += (uint64_t) ilen;

@@ -330,13 +747,15 @@ int mbedtls_sha512_update( mbedtls_sha512_context *ctx,
         left = 0;
     }

-    while( ilen >= 128 )
+    while( ilen >= SHA512_BLOCK_SIZE )
     {
-        if( ( ret = mbedtls_internal_sha512_process( ctx, input ) ) != 0 )
-            return( ret );
+        size_t processed =
+            mbedtls_internal_sha512_process_many( ctx, input, ilen );
+        if( processed < SHA512_BLOCK_SIZE )
+            return( MBEDTLS_ERR_ERROR_GENERIC_ERROR );

-        input += 128;
-        ilen -= 128;
+        input += processed;
+        ilen -= processed;
     }

     if( ilen > 0 )
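The loop above changes the internal contract: instead of one call per 128-byte block, mbedtls_internal_sha512_process_many() consumes as many whole blocks as it can and reports how many bytes it took, with any remainder buffered as before. Callers therefore keep the freedom to split input arbitrarily; a quick check of that invariant (our sketch, public API only):

#include <string.h>
#include "mbedtls/sha512.h"

/* Sketch: hashing byte-by-byte must give the same digest as one call,
 * however the update loop batches full blocks internally. Returns 0 on
 * success, -1 on mismatch or error. */
int check_split_invariance( const unsigned char *msg, size_t len )
{
    unsigned char once[64], split[64];
    mbedtls_sha512_context ctx;
    int ret = -1;
    size_t i;

    if( mbedtls_sha512( msg, len, once, 0 ) != 0 )  /* one-shot helper */
        return( -1 );

    mbedtls_sha512_init( &ctx );
    if( mbedtls_sha512_starts( &ctx, 0 ) != 0 )
        goto exit;
    for( i = 0; i < len; i++ )
        if( mbedtls_sha512_update( &ctx, msg + i, 1 ) != 0 )
            goto exit;
    if( mbedtls_sha512_finish( &ctx, split ) != 0 )
        goto exit;

    ret = ( memcmp( once, split, 64 ) == 0 ) ? 0 : -1;

exit:
    mbedtls_sha512_free( &ctx );
    return( ret );
}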
@@ -373,7 +792,7 @@ int mbedtls_sha512_finish( mbedtls_sha512_context *ctx,
     else
     {
         /* We'll need an extra block */
-        memset( ctx->buffer + used, 0, 128 - used );
+        memset( ctx->buffer + used, 0, SHA512_BLOCK_SIZE - used );

         if( ( ret = mbedtls_internal_sha512_process( ctx, ctx->buffer ) ) != 0 )
             return( ret );
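Finally, since the A64 and C paths must agree bit-for-bit, the cheapest end-to-end sanity check is a known-answer test. This sketch (ours, not part of the diff) uses the FIPS 180-4 "abc" vector and exercises whichever implementation the runtime dispatch selects:

#include <stdio.h>
#include <string.h>
#include "mbedtls/sha512.h"

int main( void )
{
    /* FIPS 180-4 test vector: SHA-512("abc") */
    static const unsigned char expected[64] = {
        0xDD, 0xAF, 0x35, 0xA1, 0x93, 0x61, 0x7A, 0xBA,
        0xCC, 0x41, 0x73, 0x49, 0xAE, 0x20, 0x41, 0x31,
        0x12, 0xE6, 0xFA, 0x4E, 0x89, 0xA9, 0x7E, 0xA2,
        0x0A, 0x9E, 0xEE, 0xE6, 0x4B, 0x55, 0xD3, 0x9A,
        0x21, 0x92, 0x99, 0x2A, 0x27, 0x4F, 0xC1, 0xA8,
        0x36, 0xBA, 0x3C, 0x23, 0xA3, 0xFE, 0xEB, 0xBD,
        0x45, 0x4D, 0x44, 0x23, 0x64, 0x3C, 0xE8, 0x0E,
        0x2A, 0x9A, 0xC9, 0x4F, 0xA5, 0x4C, 0xA4, 0x9F
    };
    unsigned char digest[64];

    if( mbedtls_sha512( (const unsigned char *) "abc", 3, digest, 0 ) != 0 )
        return( 1 );

    puts( memcmp( digest, expected, 64 ) == 0 ? "PASS" : "FAIL" );
    return( 0 );
}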