Arm: Speed up FLOAT2INT16 conversion with Neon

Using Neon for float to int conversion, and introducing platform- specific function for converting an array of float values to int16. Also adding appropriate unit test. Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
2025-05-15 16:08:30 +00:00 · 2024-09-11 14:00:32 +02:00 · 2024-09-11 14:00:32 +02:00 · d4494e6ed7
commit d4494e6ed7
parent edffe56b30
9 changed files with 277 additions and 4 deletions
--- a/celt/arm/arm_celt_map.c
+++ b/celt/arm/arm_celt_map.c
@ -1,5 +1,6 @@
 /* Copyright (c) 2010 Xiph.Org Foundation
- * Copyright (c) 2013 Parrot */
+ * Copyright (c) 2013 Parrot
 * Copyright (c) 2024 Arm Limited */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
@ -29,12 +30,25 @@
 #include "config.h"
 #endif
 #include "pitch.h"
 #include "kiss_fft.h"
 #include "mathops.h"
 #include "mdct.h"
 #include "pitch.h"
 #if defined(OPUS_HAVE_RTCD)
 # if !defined(DISABLE_FLOAT_API)
 #  if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
  celt_float2int16_c,   /* ARMv4 */
  celt_float2int16_c,   /* EDSP */
  celt_float2int16_c,   /* Media */
  celt_float2int16_neon,/* NEON */
  celt_float2int16_neon /* DOTPROD */
 };
 #  endif
 # endif
 # if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
 opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
  celt_inner_prod_c,   /* ARMv4 */
--- a/celt/arm/celt_neon_intr.c
+++ b/celt/arm/celt_neon_intr.c
@ -1,4 +1,5 @@
 /* Copyright (c) 2014-2015 Xiph.Org Foundation
   Copyright (c) 2024 Arm Limited
   Written by Viswanath Puttagunta */
 /**
   @file celt_neon_intr.c
@ -35,7 +36,57 @@
 #endif
 #include <arm_neon.h>
 #include "../float_cast.h"
 #include "../mathops.h"
 #include "../pitch.h"
 #if defined(OPUS_CHECK_ASM)
 #include <stdlib.h>
 #endif
 #if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
 {
   int i = 0;
 #if defined(__ARM_NEON)
   const int BLOCK_SIZE = 16;
   const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
   for (; i < blockedSize; i += BLOCK_SIZE)
   {
      float32x4_t orig_a = vld1q_f32(&in[i +  0]);
      float32x4_t orig_b = vld1q_f32(&in[i +  4]);
      float32x4_t orig_c = vld1q_f32(&in[i +  8]);
      float32x4_t orig_d = vld1q_f32(&in[i + 12]);
      int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
      int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
      int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
      int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
      vst1_s16(&out[i +  0], asShort_a);
      vst1_s16(&out[i +  4], asShort_b);
      vst1_s16(&out[i +  8], asShort_c);
      vst1_s16(&out[i + 12], asShort_d);
 # if defined(OPUS_CHECK_ASM)
      short out_c[BLOCK_SIZE];
      int j;
      for(j = 0; j < BLOCK_SIZE; j++)
      {
         out_c[j] = FLOAT2INT16(in[i + j]);
         celt_assert(abs((out_c[j] - out[i + j])) <= 1);
      }
 # endif
   }
 #endif
   for (; i < cnt; i++)
   {
      out[i] = FLOAT2INT16(in[i]);
   }
 }
 #endif
 #if defined(FIXED_POINT)
 #include <string.h>
--- a/celt/arm/mathops_arm.h
+++ b/celt/arm/mathops_arm.h
@ -0,0 +1,65 @@
 /* Copyright (c) 2024 Arm Limited */
 /*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:
   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.
   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 #if !defined(MATHOPS_ARM_H)
 # define MATHOPS_ARM_H
 #include "armcpu.h"
 #include "cpu_support.h"
 #include "opus_defines.h"
 # if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #include <arm_neon.h>
 static inline int32x4_t vroundf(float32x4_t x)
 {
 #  if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
    return vcvtaq_s32_f32(x);
 #  else
    uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
    uint32x4_t bias = vdupq_n_u32(0x3F000000);
    return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
 #  endif
 }
 void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
 #  if defined(OPUS_HAVE_RTCD) && \
    (defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
 extern void
 (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
 #   define OVERRIDE_FLOAT2INT16 (1)
 #   define celt_float2int16(in, out, cnt, arch) \
      ((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
 #  elif defined(OPUS_ARM_PRESUME_NEON_INTR)
 #   define OVERRIDE_FLOAT2INT16 (1)
 #   define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
 #  endif
 # endif
 #endif /* MATHOPS_ARM_H */
--- a/celt/float_cast.h
+++ b/celt/float_cast.h
@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
                return intgr ;
        }
 #elif defined(__aarch64__)
        #include <arm_neon.h>
        static OPUS_INLINE opus_int32 float2int(float flt)
        {
                return vcvtns_s32_f32(flt);
        }
 #elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
--- a/celt/mathops.c
+++ b/celt/mathops.c
@ -1,6 +1,7 @@
 /* Copyright (c) 2002-2008 Jean-Marc Valin
   Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2009 Xiph.Org Foundation
   Copyright (c) 2024 Arm Limited
   Written by Jean-Marc Valin */
 /**
   @file mathops.h
@ -35,6 +36,7 @@
 #include "config.h"
 #endif
 #include "float_cast.h"
 #include "mathops.h"
 /*Compute floor(sqrt(_val)) with exact arithmetic.
@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
 }
 #endif
 #ifndef DISABLE_FLOAT_API
 void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
 {
   int i;
   for (i = 0; i < cnt; i++)
   {
      out[i] = FLOAT2INT16(in[i]);
   }
 }
 #endif /* DISABLE_FLOAT_API */
--- a/celt/mathops.h
+++ b/celt/mathops.h
@ -1,6 +1,7 @@
 /* Copyright (c) 2002-2008 Jean-Marc Valin
   Copyright (c) 2007-2008 CSIRO
   Copyright (c) 2007-2009 Xiph.Org Foundation
   Copyright (c) 2024 Arm Limited
   Written by Jean-Marc Valin, and Yunho Huh */
 /**
   @file mathops.h
@ -38,6 +39,10 @@
 #include "entcode.h"
 #include "os_support.h"
 #if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
 #include "arm/mathops_arm.h"
 #endif
 #define PI 3.141592653f
 /* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
 }
 #endif /* FIXED_POINT */
 #ifndef DISABLE_FLOAT_API
 void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
 #ifndef OVERRIDE_FLOAT2INT16
 #define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
 #endif
 #endif /* DISABLE_FLOAT_API */
 #endif /* MATHOPS_H */
--- a/celt/tests/test_unit_mathops.c
+++ b/celt/tests/test_unit_mathops.c
@ -1,5 +1,6 @@
 /* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
                           Gregory Maxwell
   Copyright (c) 2024 Arm Limited
   Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
   and Yunho Huh */
 /*
@ -37,8 +38,10 @@
 #include <stdio.h>
 #include <math.h>
 #include "mathops.h"
 #include "bands.h"
 #include "cpu_support.h"
 #include "float_cast.h"
 #include "mathops.h"
 #ifdef FIXED_POINT
 #define WORD "%d"
@ -351,8 +354,94 @@ void testilog2(void)
 }
 #endif
 #ifndef DISABLE_FLOAT_API
 void testcelt_float2int16(int use_ref_impl, int buffer_size)
 {
 #define MAX_BUFFER_SIZE 2080
   int i, cnt;
   float floatsToConvert[MAX_BUFFER_SIZE];
   short results[MAX_BUFFER_SIZE] = { 0 };
   float scaleInt16RangeTo01;
   celt_assert(buffer_size <= MAX_BUFFER_SIZE);
   scaleInt16RangeTo01 = 1.f / 32768.f;
   cnt = 0;
   while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
   {
      floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = .0f;
      floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
      floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
      celt_assert(cnt < buffer_size);
   }
   while (cnt < buffer_size)
   {
      float inInt16Range = cnt * 7 + .5;
      inInt16Range += (cnt & 0x01) ? .1 : -.1;
      inInt16Range *= (cnt & 0x02) ? 1 : -1;
      floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
   }
   for (i = 0; i < MAX_BUFFER_SIZE; ++i)
   {
      results[i] = 42;
   }
   if (use_ref_impl)
   {
      celt_float2int16_c(floatsToConvert, results, cnt);
   } else {
      celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
   }
   for (i = 0; i < cnt; ++i)
   {
      const float expected = FLOAT2INT16(floatsToConvert[i]);
      if (results[i] != expected)
      {
         fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
               floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
         ret = 1;
      }
   }
   for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
   {
      if (results[i] != 42)
      {
         fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
         ret = 1;
         break;
      }
   }
 #undef MAX_BUFFER_SIZE
 }
 #endif
 int main(void)
 {
   int i;
   int use_ref_impl[2] = { 0, 1 };
   testbitexactcos();
   testbitexactlog2tan();
   testdiv();
@ -364,6 +453,15 @@ int main(void)
   testilog2();
   testlog2_db();
   testexp2_db();
 #endif
 #ifndef DISABLE_FLOAT_API
   for (i = 0; i <= 1; ++i)
   {
      testcelt_float2int16(use_ref_impl[i], 1);
      testcelt_float2int16(use_ref_impl[i], 32);
      testcelt_float2int16(use_ref_impl[i], 127);
      testcelt_float2int16(use_ref_impl[i], 1031);
   }
 #endif
   return ret;
 }
--- a/celt_headers.mk
+++ b/celt_headers.mk
@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
 celt/arm/fixed_arm64.h \
 celt/arm/kiss_fft_armv4.h \
 celt/arm/kiss_fft_armv5e.h \
 celt/arm/mathops_arm.h \
 celt/arm/pitch_arm.h \
 celt/arm/fft_arm.h \
 celt/arm/mdct_arm.h \
--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@ -1,4 +1,5 @@
 /* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited
   Copyright (c) 2024 Arm Limited
   Written by Jean-Marc Valin and Koen Vos */
 /*
   Redistribution and use in source and binary forms, with or without
@ -835,7 +836,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
      opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec)
 {
       VARDECL(opus_res, out);
-       int ret, i;
+       int ret;
       int nb_samples;
       ALLOC_STACK;
@ -858,8 +859,13 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
       ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0);
       if (ret > 0)
       {
 # if defined(FIXED_POINT)
          int i;
          for (i=0;i<ret*st->channels;i++)
             pcm[i] = RES2INT16(out[i]);
 # else
          celt_float2int16(out, pcm, ret*st->channels, st->arch);
 # endif
       }
       RESTORE_STACK;
       return ret;