mirror of
https://github.com/xiph/opus.git
synced 2025-05-14 15:38:32 +00:00
Arm: Speed up FLOAT2INT16 conversion with Neon
Using Neon for float to int conversion, and introducing platform- specific function for converting an array of float values to int16. Also adding appropriate unit test. Signed-off-by: Jean-Marc Valin <jeanmarcv@google.com>
This commit is contained in:
parent
edffe56b30
commit
d4494e6ed7
9 changed files with 277 additions and 4 deletions
|
@ -1,5 +1,6 @@
|
|||
/* Copyright (c) 2010 Xiph.Org Foundation
|
||||
* Copyright (c) 2013 Parrot */
|
||||
* Copyright (c) 2013 Parrot
|
||||
* Copyright (c) 2024 Arm Limited */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
|
@ -29,12 +30,25 @@
|
|||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "pitch.h"
|
||||
#include "kiss_fft.h"
|
||||
#include "mathops.h"
|
||||
#include "mdct.h"
|
||||
#include "pitch.h"
|
||||
|
||||
#if defined(OPUS_HAVE_RTCD)
|
||||
|
||||
# if !defined(DISABLE_FLOAT_API)
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt) = {
|
||||
celt_float2int16_c, /* ARMv4 */
|
||||
celt_float2int16_c, /* EDSP */
|
||||
celt_float2int16_c, /* Media */
|
||||
celt_float2int16_neon,/* NEON */
|
||||
celt_float2int16_neon /* DOTPROD */
|
||||
};
|
||||
# endif
|
||||
# endif
|
||||
|
||||
# if defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
opus_val32 (*const CELT_INNER_PROD_IMPL[OPUS_ARCHMASK+1])(const opus_val16 *x, const opus_val16 *y, int N) = {
|
||||
celt_inner_prod_c, /* ARMv4 */
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
/* Copyright (c) 2014-2015 Xiph.Org Foundation
|
||||
Copyright (c) 2024 Arm Limited
|
||||
Written by Viswanath Puttagunta */
|
||||
/**
|
||||
@file celt_neon_intr.c
|
||||
|
@ -35,7 +36,57 @@
|
|||
#endif
|
||||
|
||||
#include <arm_neon.h>
|
||||
#include "../float_cast.h"
|
||||
#include "../mathops.h"
|
||||
#include "../pitch.h"
|
||||
#if defined(OPUS_CHECK_ASM)
|
||||
#include <stdlib.h>
|
||||
#endif
|
||||
|
||||
#if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
|
||||
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
|
||||
{
|
||||
int i = 0;
|
||||
|
||||
#if defined(__ARM_NEON)
|
||||
const int BLOCK_SIZE = 16;
|
||||
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
|
||||
|
||||
for (; i < blockedSize; i += BLOCK_SIZE)
|
||||
{
|
||||
float32x4_t orig_a = vld1q_f32(&in[i + 0]);
|
||||
float32x4_t orig_b = vld1q_f32(&in[i + 4]);
|
||||
float32x4_t orig_c = vld1q_f32(&in[i + 8]);
|
||||
float32x4_t orig_d = vld1q_f32(&in[i + 12]);
|
||||
|
||||
int16x4_t asShort_a = vqmovn_s32(vroundf(vmulq_n_f32(orig_a, CELT_SIG_SCALE)));
|
||||
int16x4_t asShort_b = vqmovn_s32(vroundf(vmulq_n_f32(orig_b, CELT_SIG_SCALE)));
|
||||
int16x4_t asShort_c = vqmovn_s32(vroundf(vmulq_n_f32(orig_c, CELT_SIG_SCALE)));
|
||||
int16x4_t asShort_d = vqmovn_s32(vroundf(vmulq_n_f32(orig_d, CELT_SIG_SCALE)));
|
||||
|
||||
vst1_s16(&out[i + 0], asShort_a);
|
||||
vst1_s16(&out[i + 4], asShort_b);
|
||||
vst1_s16(&out[i + 8], asShort_c);
|
||||
vst1_s16(&out[i + 12], asShort_d);
|
||||
# if defined(OPUS_CHECK_ASM)
|
||||
short out_c[BLOCK_SIZE];
|
||||
int j;
|
||||
for(j = 0; j < BLOCK_SIZE; j++)
|
||||
{
|
||||
out_c[j] = FLOAT2INT16(in[i + j]);
|
||||
celt_assert(abs((out_c[j] - out[i + j])) <= 1);
|
||||
}
|
||||
# endif
|
||||
}
|
||||
#endif
|
||||
|
||||
for (; i < cnt; i++)
|
||||
{
|
||||
out[i] = FLOAT2INT16(in[i]);
|
||||
}
|
||||
}
|
||||
#endif
|
||||
|
||||
#if defined(FIXED_POINT)
|
||||
#include <string.h>
|
||||
|
|
65
celt/arm/mathops_arm.h
Normal file
65
celt/arm/mathops_arm.h
Normal file
|
@ -0,0 +1,65 @@
|
|||
/* Copyright (c) 2024 Arm Limited */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
modification, are permitted provided that the following conditions
|
||||
are met:
|
||||
|
||||
- Redistributions of source code must retain the above copyright
|
||||
notice, this list of conditions and the following disclaimer.
|
||||
|
||||
- Redistributions in binary form must reproduce the above copyright
|
||||
notice, this list of conditions and the following disclaimer in the
|
||||
documentation and/or other materials provided with the distribution.
|
||||
|
||||
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
|
||||
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||||
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||||
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
|
||||
OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
|
||||
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
|
||||
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
|
||||
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
|
||||
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
|
||||
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
|
||||
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*/
|
||||
|
||||
#if !defined(MATHOPS_ARM_H)
|
||||
# define MATHOPS_ARM_H
|
||||
|
||||
#include "armcpu.h"
|
||||
#include "cpu_support.h"
|
||||
#include "opus_defines.h"
|
||||
|
||||
# if !defined(DISABLE_FLOAT_API) && defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
|
||||
#include <arm_neon.h>
|
||||
|
||||
static inline int32x4_t vroundf(float32x4_t x)
|
||||
{
|
||||
# if defined(__aarch64__) || (defined(__ARM_ARCH) && __ARM_ARCH >= 8)
|
||||
return vcvtaq_s32_f32(x);
|
||||
# else
|
||||
uint32x4_t sign = vandq_u32(vreinterpretq_u32_f32(x), vdupq_n_u32(0x80000000));
|
||||
uint32x4_t bias = vdupq_n_u32(0x3F000000);
|
||||
return vcvtq_s32_f32(vaddq_f32(x, vreinterpretq_f32_u32(vorrq_u32(bias, sign))));
|
||||
# endif
|
||||
}
|
||||
|
||||
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
|
||||
# if defined(OPUS_HAVE_RTCD) && \
|
||||
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
|
||||
extern void
|
||||
(*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
|
||||
|
||||
# define OVERRIDE_FLOAT2INT16 (1)
|
||||
# define celt_float2int16(in, out, cnt, arch) \
|
||||
((*CELT_FLOAT2INT16_IMPL[(arch)&OPUS_ARCHMASK])(in, out, cnt))
|
||||
|
||||
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
|
||||
# define OVERRIDE_FLOAT2INT16 (1)
|
||||
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
|
||||
# endif
|
||||
# endif
|
||||
|
||||
#endif /* MATHOPS_ARM_H */
|
|
@ -98,6 +98,13 @@ static OPUS_INLINE opus_int32 float2int(float x) {return _mm_cvt_ss2si(_mm_set_s
|
|||
|
||||
return intgr ;
|
||||
}
|
||||
#elif defined(__aarch64__)
|
||||
|
||||
#include <arm_neon.h>
|
||||
static OPUS_INLINE opus_int32 float2int(float flt)
|
||||
{
|
||||
return vcvtns_s32_f32(flt);
|
||||
}
|
||||
|
||||
#elif defined(HAVE_LRINTF) && defined(__STDC_VERSION__) && __STDC_VERSION__ >= 199901L
|
||||
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/* Copyright (c) 2002-2008 Jean-Marc Valin
|
||||
Copyright (c) 2007-2008 CSIRO
|
||||
Copyright (c) 2007-2009 Xiph.Org Foundation
|
||||
Copyright (c) 2024 Arm Limited
|
||||
Written by Jean-Marc Valin */
|
||||
/**
|
||||
@file mathops.h
|
||||
|
@ -35,6 +36,7 @@
|
|||
#include "config.h"
|
||||
#endif
|
||||
|
||||
#include "float_cast.h"
|
||||
#include "mathops.h"
|
||||
|
||||
/*Compute floor(sqrt(_val)) with exact arithmetic.
|
||||
|
@ -215,3 +217,16 @@ opus_val32 celt_rcp(opus_val32 x)
|
|||
}
|
||||
|
||||
#endif
|
||||
|
||||
#ifndef DISABLE_FLOAT_API
|
||||
|
||||
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt)
|
||||
{
|
||||
int i;
|
||||
for (i = 0; i < cnt; i++)
|
||||
{
|
||||
out[i] = FLOAT2INT16(in[i]);
|
||||
}
|
||||
}
|
||||
|
||||
#endif /* DISABLE_FLOAT_API */
|
||||
|
|
|
@ -1,6 +1,7 @@
|
|||
/* Copyright (c) 2002-2008 Jean-Marc Valin
|
||||
Copyright (c) 2007-2008 CSIRO
|
||||
Copyright (c) 2007-2009 Xiph.Org Foundation
|
||||
Copyright (c) 2024 Arm Limited
|
||||
Written by Jean-Marc Valin, and Yunho Huh */
|
||||
/**
|
||||
@file mathops.h
|
||||
|
@ -38,6 +39,10 @@
|
|||
#include "entcode.h"
|
||||
#include "os_support.h"
|
||||
|
||||
#if defined(OPUS_ARM_MAY_HAVE_NEON_INTR)
|
||||
#include "arm/mathops_arm.h"
|
||||
#endif
|
||||
|
||||
#define PI 3.141592653f
|
||||
|
||||
/* Multiplies two 16-bit fractional values. Bit-exactness of this macro is important */
|
||||
|
@ -476,4 +481,15 @@ static OPUS_INLINE opus_val16 celt_atan2p(opus_val16 y, opus_val16 x)
|
|||
}
|
||||
|
||||
#endif /* FIXED_POINT */
|
||||
|
||||
#ifndef DISABLE_FLOAT_API
|
||||
|
||||
void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
|
||||
|
||||
#ifndef OVERRIDE_FLOAT2INT16
|
||||
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
|
||||
#endif
|
||||
|
||||
#endif /* DISABLE_FLOAT_API */
|
||||
|
||||
#endif /* MATHOPS_H */
|
||||
|
|
|
@ -1,5 +1,6 @@
|
|||
/* Copyright (c) 2008-2011 Xiph.Org Foundation, Mozilla Corporation,
|
||||
Gregory Maxwell
|
||||
Copyright (c) 2024 Arm Limited
|
||||
Written by Jean-Marc Valin, Gregory Maxwell, Timothy B. Terriberry,
|
||||
and Yunho Huh */
|
||||
/*
|
||||
|
@ -37,8 +38,10 @@
|
|||
|
||||
#include <stdio.h>
|
||||
#include <math.h>
|
||||
#include "mathops.h"
|
||||
#include "bands.h"
|
||||
#include "cpu_support.h"
|
||||
#include "float_cast.h"
|
||||
#include "mathops.h"
|
||||
|
||||
#ifdef FIXED_POINT
|
||||
#define WORD "%d"
|
||||
|
@ -351,8 +354,94 @@ void testilog2(void)
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
#ifndef DISABLE_FLOAT_API
|
||||
|
||||
void testcelt_float2int16(int use_ref_impl, int buffer_size)
|
||||
{
|
||||
|
||||
#define MAX_BUFFER_SIZE 2080
|
||||
int i, cnt;
|
||||
float floatsToConvert[MAX_BUFFER_SIZE];
|
||||
short results[MAX_BUFFER_SIZE] = { 0 };
|
||||
float scaleInt16RangeTo01;
|
||||
|
||||
celt_assert(buffer_size <= MAX_BUFFER_SIZE);
|
||||
|
||||
scaleInt16RangeTo01 = 1.f / 32768.f;
|
||||
cnt = 0;
|
||||
|
||||
while (cnt + 15 < buffer_size && cnt < buffer_size / 2)
|
||||
{
|
||||
floatsToConvert[cnt++] = 77777.0f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = 33000.0f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = 32768.0f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = 32767.4f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = 32766.6f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = .501 * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = .499f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = .0f;
|
||||
floatsToConvert[cnt++] = -.499f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -.501f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -32767.6f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -32768.4f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -32769.0f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -33000.0f * scaleInt16RangeTo01;
|
||||
floatsToConvert[cnt++] = -77777.0f * scaleInt16RangeTo01;
|
||||
|
||||
celt_assert(cnt < buffer_size);
|
||||
}
|
||||
|
||||
while (cnt < buffer_size)
|
||||
{
|
||||
float inInt16Range = cnt * 7 + .5;
|
||||
inInt16Range += (cnt & 0x01) ? .1 : -.1;
|
||||
inInt16Range *= (cnt & 0x02) ? 1 : -1;
|
||||
floatsToConvert[cnt++] = inInt16Range * scaleInt16RangeTo01;
|
||||
}
|
||||
|
||||
for (i = 0; i < MAX_BUFFER_SIZE; ++i)
|
||||
{
|
||||
results[i] = 42;
|
||||
}
|
||||
|
||||
if (use_ref_impl)
|
||||
{
|
||||
celt_float2int16_c(floatsToConvert, results, cnt);
|
||||
} else {
|
||||
celt_float2int16(floatsToConvert, results, cnt, opus_select_arch());
|
||||
}
|
||||
|
||||
for (i = 0; i < cnt; ++i)
|
||||
{
|
||||
const float expected = FLOAT2INT16(floatsToConvert[i]);
|
||||
if (results[i] != expected)
|
||||
{
|
||||
fprintf (stderr, "testcelt_float2int16 failed: celt_float2int16 converted %f (index: %d) to %d (x*32768=%f, expected: %d, cnt: %d, ref: %d)\n",
|
||||
floatsToConvert[i], i, (int)results[i], floatsToConvert[i] * 32768.0f, (int)expected, buffer_size, use_ref_impl);
|
||||
ret = 1;
|
||||
}
|
||||
}
|
||||
|
||||
for (i = cnt; i < MAX_BUFFER_SIZE; ++i)
|
||||
{
|
||||
if (results[i] != 42)
|
||||
{
|
||||
fprintf (stderr, "testcelt_float2int16 failed: buffer overflow (cnt: %d, ref: %d)\n", buffer_size, use_ref_impl);
|
||||
ret = 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
#undef MAX_BUFFER_SIZE
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
int main(void)
|
||||
{
|
||||
int i;
|
||||
int use_ref_impl[2] = { 0, 1 };
|
||||
|
||||
testbitexactcos();
|
||||
testbitexactlog2tan();
|
||||
testdiv();
|
||||
|
@ -364,6 +453,15 @@ int main(void)
|
|||
testilog2();
|
||||
testlog2_db();
|
||||
testexp2_db();
|
||||
#endif
|
||||
#ifndef DISABLE_FLOAT_API
|
||||
for (i = 0; i <= 1; ++i)
|
||||
{
|
||||
testcelt_float2int16(use_ref_impl[i], 1);
|
||||
testcelt_float2int16(use_ref_impl[i], 32);
|
||||
testcelt_float2int16(use_ref_impl[i], 127);
|
||||
testcelt_float2int16(use_ref_impl[i], 1031);
|
||||
}
|
||||
#endif
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -39,6 +39,7 @@ celt/arm/fixed_armv5e.h \
|
|||
celt/arm/fixed_arm64.h \
|
||||
celt/arm/kiss_fft_armv4.h \
|
||||
celt/arm/kiss_fft_armv5e.h \
|
||||
celt/arm/mathops_arm.h \
|
||||
celt/arm/pitch_arm.h \
|
||||
celt/arm/fft_arm.h \
|
||||
celt/arm/mdct_arm.h \
|
||||
|
|
|
@ -1,4 +1,5 @@
|
|||
/* Copyright (c) 2010 Xiph.Org Foundation, Skype Limited
|
||||
Copyright (c) 2024 Arm Limited
|
||||
Written by Jean-Marc Valin and Koen Vos */
|
||||
/*
|
||||
Redistribution and use in source and binary forms, with or without
|
||||
|
@ -835,7 +836,7 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
|
|||
opus_int32 len, opus_int16 *pcm, int frame_size, int decode_fec)
|
||||
{
|
||||
VARDECL(opus_res, out);
|
||||
int ret, i;
|
||||
int ret;
|
||||
int nb_samples;
|
||||
ALLOC_STACK;
|
||||
|
||||
|
@ -858,8 +859,13 @@ int opus_decode(OpusDecoder *st, const unsigned char *data,
|
|||
ret = opus_decode_native(st, data, len, out, frame_size, decode_fec, 0, NULL, OPTIONAL_CLIP, NULL, 0);
|
||||
if (ret > 0)
|
||||
{
|
||||
# if defined(FIXED_POINT)
|
||||
int i;
|
||||
for (i=0;i<ret*st->channels;i++)
|
||||
pcm[i] = RES2INT16(out[i]);
|
||||
# else
|
||||
celt_float2int16(out, pcm, ret*st->channels, st->arch);
|
||||
# endif
|
||||
}
|
||||
RESTORE_STACK;
|
||||
return ret;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue