Arm: Speed up FLOAT2INT16 conversion with Neon

Using Neon for float to int conversion, and introducing platform-
specific function for converting an array of float values to int16.
Also adding appropriate unit test.

Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
This commit is contained in:
Sandor Zsombor Vegh 2024-09-11 14:00:32 +02:00 committed by Jean-Marc Valin
parent edffe56b30
commit d4494e6ed7
No known key found for this signature in database
GPG key ID: 8D2952BBB52C646D
9 changed files with 277 additions and 4 deletions

View file

@ -1,5 +1,6 @@
/* Copyright (c) 2010 Xiph.Org Foundation
* Copyright (c) 2013 Parrot */
* Copyright (c) 2013 Parrot
* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
@ -29,12 +30,25 @@
#include "config.h"
#endif
#include "pitch.h"
#include "kiss_fft.h"
#include "mathops.h"
#include "mdct.h"
#include "pitch.h"
#if defined(OPUS_HAVE_RTCD)
# if !defined(DISABLE_FLOAT_API)
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
celt_float2int16_c, /* ARMv4 */
celt_float2int16_c, /* EDSP */
celt_float2int16_c, /* Media */
celt_float2int16_neon,/* NEON */
celt_float2int16_neon /* DOTPROD */
};
# endif
# endif
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
celt_inner_prod_c, /* ARMv4 */

View file

@ -1,4 +1,5 @@
/* Copyright (c) 2014-2015 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Viswanath Puttagunta */
/**
@file celt_neon_intr.c
@ -35,7 +36,57 @@
#endif
#include <arm_neon.h>
#include "../float_cast.h"
#include "../mathops.h"
#include "../pitch.h"
#if defined(OPUS_CHECK_ASM)
#include <stdlib.h>
#endif
#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i = 0;
#if defined(__ARM_NEON)
const int BLOCK_SIZE = 16;
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
for (; i < blockedSize; i += BLOCK_SIZE)
{
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
float32x4_t orig_d = vld1q_f32(&in[i + 12]);
int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
vst1_s16(&out[i + 0], asShort_a);
vst1_s16(&out[i + 4], asShort_b);
vst1_s16(&out[i + 8], asShort_c);
vst1_s16(&out[i + 12], asShort_d);
# if defined(OPUS_CHECK_ASM)
short out_c[BLOCK_SIZE];
int j;
for(j = 0; j < BLOCK_SIZE; j++)
{
out_c[j] = FLOAT2INT16(in[i + j]);
celt_assert(abs((out_c[j] - out[i + j])) <= 1);
}
# endif
}
#endif
for (; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}
#endif
#if defined(FIXED_POINT)
#include <string.h>

65
celt/arm/mathops_arm.h Normal file
View file

@ -0,0 +1,65 @@
/* Copyright (c) 2024 Arm Limited */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#if !defined(MATHOPS_ARM_H)
# define MATHOPS_ARM_H
#include "armcpu.h"
#include "cpu_support.h"
#include "opus_defines.h"
# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
#include <arm_neon.h>
static inline int32x4_t vroundf(float32x4_t x)
{
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
return vcvtaq_s32_f32(x);
# else
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
uint32x4_t bias = vdupq_n_u32(0x3F000000);
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
# endif
}
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# if defined(OPUS_HAVE_RTCD) && \
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
extern void
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) \
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
# define OVERRIDE_FLOAT2INT16 (1)
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
# endif
# endif
#endif /* MATHOPS_ARM_H */

View file

@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
return intgr ;
}
#elif defined(__aarch64__)
#include <arm_neon.h>
static OPUS_INLINE opus_int32 float2int(float flt)
{
return vcvtns_s32_f32(flt);
}
#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L

View file

@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin */
/**
@file mathops.h
@ -35,6 +36,7 @@
#include "config.h"
#endif
#include "float_cast.h"
#include "mathops.h"
/*Compute floor(sqrt(_val)) with exact arithmetic.
@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
}
#endif
#ifndef DISABLE_FLOAT_API
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
{
int i;
for (i = 0; i < cnt; i++)
{
out[i] = FLOAT2INT16(in[i]);
}
}
#endif /* DISABLE_FLOAT_API */

View file

@ -1,6 +1,7 @@
/* Copyright (c) 2002-2008 Jean-Marc Valin
Copyright (c) 2007-2008 CSIRO
Copyright (c) 2007-2009 Xiph.Org Foundation
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, and Yunho Huh */
/**
@file mathops.h
@ -38,6 +39,10 @@
#include "entcode.h"
#include "os_support.h"
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
#include "arm/mathops_arm.h"
#endif
#define PI 3.141592653f
/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
}
#endif /* FIXED_POINT */
#ifndef DISABLE_FLOAT_API
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
#ifndef OVERRIDE_FLOAT2INT16
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
#endif
#endif /* DISABLE_FLOAT_API */
#endif /* MATHOPS_H */

View file

@ -1,5 +1,6 @@
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
Gregory Maxwell
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
and Yunho Huh */
/*
@ -37,8 +38,10 @@
#include <stdio.h>
#include <math.h>
#include "mathops.h"
#include "bands.h"
#include "cpu_support.h"
#include "float_cast.h"
#include "mathops.h"
#ifdef FIXED_POINT
#define WORD "%d"
@ -351,8 +354,94 @@ void testilog2(void)
}
#endif
#ifndef DISABLE_FLOAT_API
void testcelt_float2int16(int use_ref_impl, int buffer_size)
{
#define MAX_BUFFER_SIZE 2080
int i, cnt;
float floatsToConvert[MAX_BUFFER_SIZE];
short results[MAX_BUFFER_SIZE] = { 0 };
float scaleInt16RangeTo01;
celt_assert(buffer_size <= MAX_BUFFER_SIZE);
scaleInt16RangeTo01 = 1.f / 32768.f;
cnt = 0;
while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
{
floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = .0f;
floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
celt_assert(cnt < buffer_size);
}
while (cnt < buffer_size)
{
float inInt16Range = cnt * 7 + .5;
inInt16Range += (cnt & 0x01) ? .1 : -.1;
inInt16Range *= (cnt & 0x02) ? 1 : -1;
floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
}
for (i = 0; i < MAX_BUFFER_SIZE; ++i)
{
results[i] = 42;
}
if (use_ref_impl)
{
celt_float2int16_c(floatsToConvert, results, cnt);
} else {
celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
}
for (i = 0; i < cnt; ++i)
{
const float expected = FLOAT2INT16(floatsToConvert[i]);
if (results[i] != expected)
{
fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
ret = 1;
}
}
for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
{
if (results[i] != 42)
{
fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
ret = 1;
break;
}
}
#undef MAX_BUFFER_SIZE
}
#endif
int main(void)
{
int i;
int use_ref_impl[2] = { 0, 1 };
testbitexactcos();
testbitexactlog2tan();
testdiv();
@ -364,6 +453,15 @@ int main(void)
testilog2();
testlog2_db();
testexp2_db();
#endif
#ifndef DISABLE_FLOAT_API
for (i = 0; i <= 1; ++i)
{
testcelt_float2int16(use_ref_impl[i], 1);
testcelt_float2int16(use_ref_impl[i], 32);
testcelt_float2int16(use_ref_impl[i], 127);
testcelt_float2int16(use_ref_impl[i], 1031);
}
#endif
return ret;
}

View file

@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
celt/arm/fixed_arm64.h \
celt/arm/kiss_fft_armv4.h \
celt/arm/kiss_fft_armv5e.h \
celt/arm/mathops_arm.h \
celt/arm/pitch_arm.h \
celt/arm/fft_arm.h \
celt/arm/mdct_arm.h \

View file

@ -1,4 +1,5 @@
/* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited
Copyright (c) 2024 Arm Limited
Written by Jean-Marc Valin and Koen Vos */
/*
Redistribution and use in source and binary forms, with or without
@ -835,7 +836,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec)
{
VARDECL(opus_res, out);
int ret, i;
int ret;
int nb_samples;
ALLOC_STACK;
@ -858,8 +859,13 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0);
if (ret > 0)
{
# if defined(FIXED_POINT)
int i;
for (i=0;i<ret*st->channels;i++)
pcm[i] = RES2INT16(out[i]);
# else
celt_float2int16(out, pcm, ret*st->channels, st->arch);
# endif
}
RESTORE_STACK;
return ret;