mirror of
https://github.com/xiph/opus.git
synced 2025-05-16 16:38:30 +00:00
moving code around
This commit is contained in:
parent
05f4851dcd
commit
c395a68b7d
1 changed file with 75 additions and 76 deletions
151
dnn/nnet.c
151
dnn/nnet.c
|
@ -132,7 +132,73 @@ static void vec_sigmoid(float *y, const float *x, int N)
|
|||
y[i] = (ex)/(ex+1);
|
||||
}
|
||||
}
|
||||
#else
|
||||
|
||||
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
for (i=0;i<rows;i+=16)
|
||||
{
|
||||
float * restrict y;
|
||||
__m256 vy0, vy8;
|
||||
y = &out[i];
|
||||
vy0 = _mm256_loadu_ps(&y[0]);
|
||||
vy8 = _mm256_loadu_ps(&y[8]);
|
||||
for (j=0;j<cols;j++)
|
||||
{
|
||||
__m256 vxj;
|
||||
__m256 vw;
|
||||
vxj = _mm256_broadcast_ss(&x[j]);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
|
||||
vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
|
||||
vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
|
||||
}
|
||||
_mm256_storeu_ps (&y[0], vy0);
|
||||
_mm256_storeu_ps (&y[8], vy8);
|
||||
}
|
||||
}
|
||||
static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
for (i=0;i<rows;i+=16)
|
||||
{
|
||||
float * restrict y;
|
||||
int cols;
|
||||
__m256 vy0, vy8;
|
||||
y = &out[i];
|
||||
vy0 = _mm256_loadu_ps(&y[0]);
|
||||
vy8 = _mm256_loadu_ps(&y[8]);
|
||||
cols = *idx++;
|
||||
for (j=0;j<cols;j++)
|
||||
{
|
||||
int id;
|
||||
__m256 vxj;
|
||||
__m256 vw;
|
||||
id = *idx++;
|
||||
vxj = _mm256_broadcast_ss(&x[id]);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[0]);
|
||||
vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[8]);
|
||||
vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
|
||||
weights += 16;
|
||||
}
|
||||
_mm256_storeu_ps (&y[0], vy0);
|
||||
_mm256_storeu_ps (&y[8], vy8);
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
#else /* No AVX2/FMA support */
|
||||
|
||||
|
||||
#warning Compiling without any vectorization. This code will be very slow
|
||||
#warning Try adding -mavx2 -mfma
|
||||
|
||||
|
||||
static float celt_exp2(float x)
|
||||
{
|
||||
int integer;
|
||||
|
@ -211,81 +277,6 @@ static void vec_sigmoid(float *y, const float *x, int N)
|
|||
}
|
||||
}
|
||||
|
||||
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* Rectified linear unit: yields 0 when x < 0, otherwise yields x unchanged. */
static OPUS_INLINE float relu(float x)
{
   if (x < 0)
      return 0;
   return x;
}
|
||||
|
||||
#ifdef __AVX2__
|
||||
#include <immintrin.h>
|
||||
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
for (i=0;i<rows;i+=16)
|
||||
{
|
||||
float * restrict y;
|
||||
__m256 vy0, vy8;
|
||||
y = &out[i];
|
||||
vy0 = _mm256_loadu_ps(&y[0]);
|
||||
vy8 = _mm256_loadu_ps(&y[8]);
|
||||
for (j=0;j<cols;j++)
|
||||
{
|
||||
__m256 vxj;
|
||||
__m256 vw;
|
||||
vxj = _mm256_broadcast_ss(&x[j]);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[j*col_stride + i]);
|
||||
vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[j*col_stride + i + 8]);
|
||||
vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
|
||||
}
|
||||
_mm256_storeu_ps (&y[0], vy0);
|
||||
_mm256_storeu_ps (&y[8], vy8);
|
||||
}
|
||||
}
|
||||
static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
for (i=0;i<rows;i+=16)
|
||||
{
|
||||
float * restrict y;
|
||||
int cols;
|
||||
__m256 vy0, vy8;
|
||||
y = &out[i];
|
||||
vy0 = _mm256_loadu_ps(&y[0]);
|
||||
vy8 = _mm256_loadu_ps(&y[8]);
|
||||
cols = *idx++;
|
||||
for (j=0;j<cols;j++)
|
||||
{
|
||||
int id;
|
||||
__m256 vxj;
|
||||
__m256 vw;
|
||||
id = *idx++;
|
||||
vxj = _mm256_broadcast_ss(&x[id]);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[0]);
|
||||
vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
|
||||
|
||||
vw = _mm256_loadu_ps(&weights[8]);
|
||||
vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
|
||||
weights += 16;
|
||||
}
|
||||
_mm256_storeu_ps (&y[0], vy0);
|
||||
_mm256_storeu_ps (&y[8], vy8);
|
||||
}
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
#warning Compiling without any vectorization. This code will be very slow
|
||||
#warning Try adding -mavx2 -mfma
|
||||
|
||||
static void gemm_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
|
@ -354,6 +345,14 @@ static void sparse_gemm_accum16(float *out, const float *w, int rows, const int
|
|||
}
|
||||
#endif
|
||||
|
||||
|
||||
|
||||
/* Rectified linear unit: yields 0 when x < 0, otherwise yields x unchanged. */
static OPUS_INLINE float relu(float x)
{
   if (x < 0)
      return 0;
   return x;
}
|
||||
|
||||
|
||||
static void gemm_accum(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
||||
{
|
||||
int i, j;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue