/* Copyright (c) 2018 David Rowe
                 2018 Mozilla
                 2008-2011 Octasic Inc.
                 2012-2017 Jean-Marc Valin */
/*
   Redistribution and use in source and binary forms, with or without
   modification, are permitted provided that the following conditions
   are met:

   - Redistributions of source code must retain the above copyright
   notice, this list of conditions and the following disclaimer.

   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions and the following disclaimer in the
   documentation and/or other materials provided with the distribution.

   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/

/* NEON support for ARM machines */

#ifndef VEC_NEON_H
#define VEC_NEON_H

#include <arm_neon.h>
#include "os_support.h"

/* Clang exposes these intrinsics even in 32-bit mode when targeting ARMv8 or
   newer, whereas gcc does not; see
   https://gcc.gnu.org/bugzilla/show_bug.cgi?id=95399 */
#if defined(__arm__) && !defined(__aarch64__) && (__ARM_ARCH < 8 || !defined(__clang__))
/* Emulate vcvtnq_s32_f32() (round-to-nearest float->int conversion) for ARMv7
   Neon, via a Q8 fixed-point conversion followed by a rounding shift right. */
static OPUS_INLINE int32x4_t vcvtnq_s32_f32(float32x4_t x) {
  return vrshrq_n_s32(vcvtq_n_s32_f32(x, 8), 8);
}

/* Emulate vpaddq_s16() (full-width pairwise add) for ARMv7 Neon. */
static OPUS_INLINE int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
  return vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), vpadd_s16(vget_low_s16(b), vget_high_s16(b)));
}

/* Emulate vmull_high_s8() (widening multiply of the high halves) for ARMv7 Neon. */
static OPUS_INLINE int16x8_t vmull_high_s8(int8x16_t a, int8x16_t b) {
  return vmull_s8(vget_high_s8(a), vget_high_s8(b));
}
#endif

#ifdef __ARM_FEATURE_FMA
/* If we can, force the compiler to use an FMA instruction rather than break
   vmlaq_f32() into fmul/fadd. */
#define vmlaq_f32(a,b,c) vfmaq_f32(a,b,c)
#endif

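/* Note: vmlaq_f32(a, b, c) computes a + b*c per lane; the redefinition above
   only makes the multiply-add fused (single rounding) when FMA is available. */
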
#ifndef LPCNET_TEST
static inline float32x4_t exp4_approx(float32x4_t x) {
  int32x4_t i;
  float32x4_t xf;

  x = vmaxq_f32(vminq_f32(x, vdupq_n_f32(88.f)), vdupq_n_f32(-88.f));

  /* express exp(x) as exp2(x/log(2)), add 127 for the exponent later */
  x = vmlaq_f32(vdupq_n_f32(127.f), x, vdupq_n_f32(1.44269504f));

  /* split into integer and fractional parts */
  i = vcvtq_s32_f32(x);
  xf = vcvtq_f32_s32(i);
  x = vsubq_f32(x, xf);

  float32x4_t K0 = vdupq_n_f32(0.99992522f);
  float32x4_t K1 = vdupq_n_f32(0.69583354f);
  float32x4_t K2 = vdupq_n_f32(0.22606716f);
  float32x4_t K3 = vdupq_n_f32(0.078024523f);
  float32x4_t Y = vmlaq_f32(K0, x, vmlaq_f32(K1, x, vmlaq_f32(K2, K3, x)));

  /* compute 2^i */
  float32x4_t exponent = vreinterpretq_f32_s32(vshlq_n_s32(i, 23));

  Y = vmulq_f32(Y, exponent);
  return Y;
}

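/* Illustrative scalar sketch of the trick used by exp4_approx() above
   (hypothetical helper, not used anywhere): exp(x) is rewritten as
   2^(x*log2(e)); the integer part of the biased exponent is moved into the
   IEEE-754 exponent field, and the fractional part is covered by the cubic
   polynomial in K0..K3. */
#if 0
static float exp_scalar_sketch(float x) {
  union { float f; unsigned int u; } pow2i;
  float t, frac, poly;
  int i;
  if (x > 88.f) x = 88.f;
  if (x < -88.f) x = -88.f;
  t = 127.f + x*1.44269504f;   /* biased exponent, as a float */
  i = (int)t;                  /* integer part (t is always positive here) */
  frac = t - (float)i;         /* fractional part in [0, 1) */
  poly = 0.99992522f + frac*(0.69583354f + frac*(0.22606716f + 0.078024523f*frac));
  pow2i.u = (unsigned int)i << 23;   /* 2^(i-127), zero mantissa */
  return poly*pow2i.f;
}
#endif
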
static inline float32x4_t tanh4_approx(float32x4_t X)
{
   const float32x4_t N0 = vdupq_n_f32(952.52801514f);
   const float32x4_t N1 = vdupq_n_f32(96.39235687f);
   const float32x4_t N2 = vdupq_n_f32(0.60863042f);
   const float32x4_t D0 = vdupq_n_f32(952.72399902f);
   const float32x4_t D1 = vdupq_n_f32(413.36801147f);
   const float32x4_t D2 = vdupq_n_f32(11.88600922f);
   const float32x4_t max_out = vdupq_n_f32(1.f);
   const float32x4_t min_out = vdupq_n_f32(-1.f);
   float32x4_t X2, num, den;
   X2 = vmulq_f32(X, X);
   num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
   den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
   num = vmulq_f32(num, X);
   den = vrecpeq_f32(den);
   num = vmulq_f32(num, den);
   return vmaxq_f32(min_out, vminq_f32(max_out, num));
}

static inline float32x4_t sigmoid4_approx(float32x4_t X)
{
   const float32x4_t N0 = vdupq_n_f32(238.13200378f);
   const float32x4_t N1 = vdupq_n_f32(6.02452230f);
   const float32x4_t N2 = vdupq_n_f32(0.00950985f);
   const float32x4_t D0 = vdupq_n_f32(952.72399902f);
   const float32x4_t D1 = vdupq_n_f32(103.34200287f);
   const float32x4_t D2 = vdupq_n_f32(0.74287558f);
   const float32x4_t half = vdupq_n_f32(0.5f);
   const float32x4_t max_out = vdupq_n_f32(1.f);
   const float32x4_t min_out = vdupq_n_f32(0.f);
   float32x4_t X2, num, den;
   X2 = vmulq_f32(X, X);
   num = vmlaq_f32(N0, X2, vmlaq_f32(N1, N2, X2));
   den = vmlaq_f32(D0, X2, vmlaq_f32(D1, D2, X2));
   num = vmulq_f32(num, X);
   den = vrecpeq_f32(den);
   num = vmlaq_f32(half, num, den);
   return vmaxq_f32(min_out, vminq_f32(max_out, num));
}

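/* As read from the code above, with x2 = x*x and each function's own
   N0..N2 and D0..D2 constants, the two rational approximations are:
      tanh4_approx(x)    ~= clamp(x*(N0 + N1*x2 + N2*x2*x2) / (D0 + D1*x2 + D2*x2*x2), -1, 1)
      sigmoid4_approx(x) ~= clamp(0.5 + x*(N0 + N1*x2 + N2*x2*x2) / (D0 + D1*x2 + D2*x2*x2), 0, 1)
   except that the division is performed with vrecpeq_f32(), a coarse
   reciprocal estimate, so the quotient itself is only approximate. */
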
static inline float lpcnet_exp(float x)
{
   float out[4];
   float32x4_t X, Y;
   X = vdupq_n_f32(x);
   Y = exp4_approx(X);
   vst1q_f32(out, Y);
   return out[0];
}

static inline float tanh_approx(float x)
{
   float out[4];
   float32x4_t X, Y;
   X = vdupq_n_f32(x);
   Y = tanh4_approx(X);
   vst1q_f32(out, Y);
   return out[0];
}

static inline float sigmoid_approx(float x)
{
   float out[4];
   float32x4_t X, Y;
   X = vdupq_n_f32(x);
   Y = sigmoid4_approx(X);
   vst1q_f32(out, Y);
   return out[0];
}

static inline void softmax(float *y, const float *x, int N)
{
   int i;
   for (i=0;i<N-3;i+=4)
   {
      float32x4_t X, Y;
      X = vld1q_f32(&x[i]);
      Y = exp4_approx(X);
      vst1q_f32(&y[i], Y);
   }
   for (;i<N;i++)
      y[i] = lpcnet_exp(x[i]);
}

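/* Note that softmax() above only exponentiates each element; it does not
   normalize by the sum of the outputs. A hypothetical caller that wants a
   true softmax would do something like this (sketch only): */
#if 0
static void softmax_normalized_sketch(float *p, const float *logits, int n) {
  int k;
  float sum = 0.f;
  softmax(p, logits, n);          /* p[k] = approx. exp(logits[k]) */
  for (k=0;k<n;k++) sum += p[k];
  for (k=0;k<n;k++) p[k] /= sum;  /* make the outputs sum to 1 */
}
#endif
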
static inline void vec_tanh(float *y, const float *x, int N)
{
   int i;
   for (i=0;i<N-3;i+=4)
   {
      float32x4_t X, Y;
      X = vld1q_f32(&x[i]);
      Y = tanh4_approx(X);
      vst1q_f32(&y[i], Y);
   }
   for (;i<N;i++)
   {
      float ex2;
      ex2 = lpcnet_exp(2*x[i]);
      y[i] = (ex2-1)/(ex2+1);
   }
}

static inline void vec_sigmoid(float *y, const float *x, int N)
{
   int i;
   for (i=0;i<N-3;i+=4)
   {
      float32x4_t X, Y;
      X = vld1q_f32(&x[i]);
      Y = sigmoid4_approx(X);
      vst1q_f32(&y[i], Y);
   }
   for (;i<N;i++)
   {
      float ex;
      ex = lpcnet_exp(x[i]);
      y[i] = (ex)/(ex+1);
   }
}
#endif

static inline void sgemv16x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int i, j;
   for (i=0;i<rows;i+=16)
   {
      float * restrict y = &out[i];

      /* keep y[0..15] in registers for duration of inner loop */

      float32x4_t y0_3 = vdupq_n_f32(0);
      float32x4_t y4_7 = vdupq_n_f32(0);
      float32x4_t y8_11 = vdupq_n_f32(0);
      float32x4_t y12_15 = vdupq_n_f32(0);

      for (j=0;j<cols;j++)
      {
         const float * restrict w;
         float32x4_t wvec0_3, wvec4_7, wvec8_11, wvec12_15;
         float32x4_t xj;

         w = &weights[j*col_stride + i];
         wvec0_3 = vld1q_f32(&w[0]);
         wvec4_7 = vld1q_f32(&w[4]);
         wvec8_11 = vld1q_f32(&w[8]);
         wvec12_15 = vld1q_f32(&w[12]);

         xj = vld1q_dup_f32(&x[j]);

         y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
         y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
         y8_11 = vmlaq_f32(y8_11, wvec8_11, xj);
         y12_15 = vmlaq_f32(y12_15, wvec12_15, xj);
      }

      /* save y[0..15] back to memory */

      vst1q_f32(&y[0], y0_3);
      vst1q_f32(&y[4], y4_7);
      vst1q_f32(&y[8], y8_11);
      vst1q_f32(&y[12], y12_15);
   }
}

static inline void sgemv8x1(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   int i, j;
   for (i=0;i<rows;i+=8)
   {
      float * restrict y = &out[i];

      /* keep y[0..7] in registers for duration of inner loop */

      float32x4_t y0_3 = vdupq_n_f32(0);
      float32x4_t y4_7 = vdupq_n_f32(0);

      for (j=0;j<cols;j++)
      {
         const float * restrict w;
         float32x4_t wvec0_3, wvec4_7;
         float32x4_t xj;

         w = &weights[j*col_stride + i];
         wvec0_3 = vld1q_f32(&w[0]);
         wvec4_7 = vld1q_f32(&w[4]);

         xj = vld1q_dup_f32(&x[j]);

         y0_3 = vmlaq_f32(y0_3, wvec0_3, xj);
         y4_7 = vmlaq_f32(y4_7, wvec4_7, xj);
      }

      /* save y[0..7] back to memory */

      vst1q_f32(&y[0], y0_3);
      vst1q_f32(&y[4], y4_7);
   }
}

static inline void sgemv(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
{
   if ((rows&0xf) == 0) sgemv16x1(out, weights, rows, cols, col_stride, x);
   else if ((rows&0x7) == 0) sgemv8x1(out, weights, rows, cols, col_stride, x);
   else {
      int i, j;
      for (i=0;i<rows;i++)
      {
         out[i] = 0;
         for (j=0;j<cols;j++) out[i] += weights[j*col_stride + i]*x[j];
      }
   }
}

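/* Usage sketch (hypothetical sizes and buffers): sgemv() computes out = W*x
   for a W stored column by column, where column j starts at
   weights[j*col_stride]; col_stride is the spacing between consecutive
   columns, typically the number of rows. */
#if 0
static void sgemv_example(void) {
  float W[32*16];   /* 32 rows x 16 cols, column-major */
  float x[16];
  float out[32];
  /* ... fill W and x ... */
  sgemv(out, W, 32, 16, 32, x);  /* rows is a multiple of 16, so sgemv16x1() runs */
}
#endif
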
/* Temporarily use unoptimized version */
static inline void sparse_sgemv8x4(float *out, const float *w, const int *idx, int rows, const float *x)
{
   int i, j;
   OPUS_CLEAR(out, rows);
   for (i=0;i<rows;i+=8)
   {
      int cols;
      cols = *idx++;
      for (j=0;j<cols;j++)
      {
         int pos;
         float * restrict y;
         float xj0, xj1, xj2, xj3;
         pos = (*idx++);
         xj0 = x[pos+0];
         xj1 = x[pos+1];
         xj2 = x[pos+2];
         xj3 = x[pos+3];
         y = &out[i];
         y[0] += w[0]*xj0;
         y[1] += w[1]*xj0;
         y[2] += w[2]*xj0;
         y[3] += w[3]*xj0;
         y[4] += w[4]*xj0;
         y[5] += w[5]*xj0;
         y[6] += w[6]*xj0;
         y[7] += w[7]*xj0;

         y[0] += w[8]*xj1;
         y[1] += w[9]*xj1;
         y[2] += w[10]*xj1;
         y[3] += w[11]*xj1;
         y[4] += w[12]*xj1;
         y[5] += w[13]*xj1;
         y[6] += w[14]*xj1;
         y[7] += w[15]*xj1;

         y[0] += w[16]*xj2;
         y[1] += w[17]*xj2;
         y[2] += w[18]*xj2;
         y[3] += w[19]*xj2;
         y[4] += w[20]*xj2;
         y[5] += w[21]*xj2;
         y[6] += w[22]*xj2;
         y[7] += w[23]*xj2;

         y[0] += w[24]*xj3;
         y[1] += w[25]*xj3;
         y[2] += w[26]*xj3;
         y[3] += w[27]*xj3;
         y[4] += w[28]*xj3;
         y[5] += w[29]*xj3;
         y[6] += w[30]*xj3;
         y[7] += w[31]*xj3;
         w += 32;
      }
   }
}

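/* Layout of the sparse weights consumed above (as read from the loop): for
   every block of 8 output rows, *idx++ gives the number of non-zero 8x4
   blocks, and each subsequent *idx++ gives the starting input index of one
   such block. The weights w are the dense 8x4 blocks stored back to back,
   32 floats per block, column by column: w[0..7] multiply x[pos+0],
   w[8..15] multiply x[pos+1], w[16..23] multiply x[pos+2], and w[24..31]
   multiply x[pos+3]. */
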
#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)

#define MAX_INPUTS 2048
#define MAX_OUTPUTS 8192

#if __ARM_FEATURE_DOTPROD
static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b) {
   return vdotq_s32(acc, a, b);
}
#else
static inline int32x4_t vdotprod(int32x4_t acc, int8x16_t a, int8x16_t b)
{
   return vpadalq_s16(acc, vpaddq_s16(vmull_s8(vget_low_s8(a), vget_low_s8(b)), vmull_high_s8(a, b)));
}
#endif

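/* In both variants, vdotprod() accumulates four-way signed 8-bit dot products:
   for each 32-bit lane k,
      acc[k] += a[4k]*b[4k] + a[4k+1]*b[4k+1] + a[4k+2]*b[4k+2] + a[4k+3]*b[4k+3].
   The fallback builds this out of a widening multiply, a 16-bit pairwise add
   and a pairwise accumulate into the 32-bit lanes. */
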
static inline void cgemv8x4(float *_out, const opus_int8 *w, const float *scale, int rows, int cols, const float *_x)
{
   int i, j;
   opus_int32 x_int[MAX_INPUTS/4];
   opus_int8 *x = (opus_int8*) x_int;
   const float32x4_t const127 = vdupq_n_f32(127.);
   for (i=0;i<cols;i+=8) {
      int32x4_t xi0, xi4;
      int16x8_t x_short;
      xi0 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i])));
      xi4 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i+4])));
      x_short = vcombine_s16(vmovn_s32(xi0), vmovn_s32(xi4));
      vst1_s8(&x[i], vmovn_s16(x_short));
   }
   for (i=0;i<rows;i+=8)
   {
      int32x4_t acc0, acc1;
      int32x4_t acc2, acc3;
      acc0 = vdupq_n_s32(0);
      acc1 = vdupq_n_s32(0);
      acc2 = vdupq_n_s32(0);
      acc3 = vdupq_n_s32(0);
      j=0;
      for (;j<cols-4;j+=8)
      {
         int8x16_t vw0, vw1, vw2, vw3, vx0, vx1;
         vx0 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);
         vw0 = vld1q_s8(w);
         vw1 = vld1q_s8(&w[16]);
         acc0 = vdotprod(acc0, vw0, vx0);
         acc1 = vdotprod(acc1, vw1, vx0);
         vx1 = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j+4]);
         vw2 = vld1q_s8(&w[32]);
         vw3 = vld1q_s8(&w[48]);
         acc2 = vdotprod(acc2, vw2, vx1);
         acc3 = vdotprod(acc3, vw3, vx1);
         w += 64;
      }
      acc0 = vaddq_s32(acc0, acc2);
      acc1 = vaddq_s32(acc1, acc3);
      for (;j<cols;j+=4)
      {
         int8x16_t vw0, vw1, vx;
         vx = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[j]);
         vw0 = vld1q_s8(w);
         vw1 = vld1q_s8(&w[16]);
         acc0 = vdotprod(acc0, vw0, vx);
         acc1 = vdotprod(acc1, vw1, vx);
         w += 32;
      }
      vst1q_f32(&_out[i], vmulq_f32(vld1q_f32(&scale[i]), vcvtq_f32_s32(acc0)));
      vst1q_f32(&_out[i+4], vmulq_f32(vld1q_f32(&scale[i+4]), vcvtq_f32_s32(acc1)));
   }
}

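/* cgemv8x4() above is the quantized counterpart of sgemv(): the float inputs
   are scaled by 127, rounded and packed into a temporary int8 buffer, the
   int8 weight blocks are accumulated with vdotprod(), and the int32
   accumulators are converted back to float and multiplied by the per-row
   scale[] factors. Presumably scale[] folds the 1/127 input scaling together
   with the weight quantization scale (cf. SCALE_1 above), but that is up to
   the caller that prepared the weights. */
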
static inline void sparse_cgemv8x4(float *_out, const opus_int8 *w, const int *idx, const float *scale, int rows, int cols, const float *_x)
{
   int i, j;
   opus_int32 x_int[MAX_INPUTS/4];
   opus_int8 *x = (opus_int8*) x_int;
   const float32x4_t const127 = vdupq_n_f32(127.);
   for (i=0;i<cols;i+=8) {
      int32x4_t xi0, xi4;
      int16x8_t x_short;
      xi0 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i])));
      xi4 = vcvtnq_s32_f32(vmulq_f32(const127, vld1q_f32(&_x[i+4])));
      x_short = vcombine_s16(vmovn_s32(xi0), vmovn_s32(xi4));
      vst1_s8(&x[i], vmovn_s16(x_short));
   }
   for (i=0;i<rows;i+=8)
   {
      int colblocks;
      int32x4_t acc0, acc1;
      acc0 = vdupq_n_s32(0);
      acc1 = vdupq_n_s32(0);
      colblocks = *idx++;
      for (j=0;j<colblocks;j++)
      {
         int pos;
         int8x16_t vw0, vw1, vx;
         pos = (*idx++);
         vx = (int8x16_t)vld1q_dup_s32((int*)(void*)&x[pos]);
         vw0 = vld1q_s8(w);
         vw1 = vld1q_s8(&w[16]);
         acc0 = vdotprod(acc0, vw0, vx);
         acc1 = vdotprod(acc1, vw1, vx);
         w += 32;
      }
      vst1q_f32(&_out[i], vmulq_f32(vld1q_f32(&scale[i]), vcvtq_f32_s32(acc0)));
      vst1q_f32(&_out[i+4], vmulq_f32(vld1q_f32(&scale[i+4]), vcvtq_f32_s32(acc1)));
   }
}

#endif /* VEC_NEON_H */