WIP: signed*unsigned arithmetic

parent 11736ca9e3
commit bce779886d

6 changed files with 134 additions and 48 deletions
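The idea: the quantized kernels multiply signed 8-bit weights by 8-bit activations, and the fast byte-multiply instructions this is presumably targeting (e.g. x86 pmaddubsw) take one unsigned and one signed operand. Offsetting the quantized input by +127 makes it unsigned, but adds a constant 127*sum(w) to every output; precomputing that constant into a per-output "subias" lets the kernel load the corrected bias and skip any per-sample fixup. A minimal standalone sketch of the identity (all values made up for the demo):

#include <stdio.h>

int main(void)
{
   signed char w[4]  = { -45, 12, 127, -3 };  /* quantized signed weights */
   int         xq[4] = { -100, 5, 64, -127 }; /* quantized signed inputs */
   int su = 0, s = 0, wsum = 0;
   for (int j = 0; j < 4; j++) {
      su   += w[j] * (127 + xq[j]); /* unsigned-offset accumulation */
      s    += w[j] * xq[j];         /* the product we actually want */
      wsum += w[j];
   }
   /* su == s + 127*wsum, so folding -127*sum(w) into the bias ("subias")
      recovers the signed result exactly. */
   printf("%d == %d\n", su - 127*wsum, s);
   return 0;
}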
dnn/nnet.c (22 changed lines)
@@ -39,16 +39,13 @@
 #include "nnet.h"
 #include "nnet_data.h"
 
+#ifdef NO_OPTIMIZATIONS
+#warning Compiling without any vectorization. This code will be very slow
+#endif
+
+
 #define SOFTMAX_HACK
 
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
-#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
-#endif
 
 static OPUS_INLINE float relu(float x)
 {
@@ -294,14 +291,19 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
    celt_assert(input != state);
    celt_assert(gru->reset_after);
    RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
    for (i=0;i<3*N;i++)
-      recur[i] = gru->bias[3*N + i];
+      recur[i] = gru->subias[3*N + i];
+#else
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->bias[3*N + i];
+#endif
    for (k=0;k<3;k++)
    {
       for (i=0;i<N;i++)
          recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
    }
-   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
    for (i=0;i<2*N;i++)
       zrh[i] += recur[i];
    compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
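Note on this hunk: under USE_SU_BIAS the recurrent half of the bias (entries 3*N..6*N-1) comes from the new subias array, which has the unsigned-offset term pre-subtracted, and sparse_sgemv_accum8x4 gains a cols argument telling it how many inputs to pre-quantize. One WIP wart worth flagging: the call passes 3*N for cols, yet the sparse column blocks only index into the N-entry state vector, so the pre-quantization loop appears to read past the state unless the caller's buffer is oversized. A standalone sketch of the input mapping the new kernels apply (values made up):

#include <math.h>
#include <stdio.h>

int main(void)
{
   /* USE_SU_BIAS maps a float activation in roughly [-1, 1] to an unsigned
      byte in [0, 254] via 127 + round(127*x); the plain DOT_PROD path keeps
      the signed value round(127*x) in [-127, 127]. */
   float _x[4] = { -1.f, -0.9f, 0.01f, 1.f };
   for (int i = 0; i < 4; i++) {
      int xs = (int)floor(.5 + 127*_x[i]);  /* signed quantization */
      printf("%6.2f -> signed %4d, unsigned %3d\n", _x[i], xs, 127 + xs);
   }
   return 0;
}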
dnn/nnet.h

@@ -28,18 +28,14 @@
 #ifndef _NNET_H_
 #define _NNET_H_
 
+#include "vec.h"
+
 #define ACTIVATION_LINEAR 0
 #define ACTIVATION_SIGMOID 1
 #define ACTIVATION_TANH 2
 #define ACTIVATION_RELU 3
 #define ACTIVATION_SOFTMAX 4
 
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
 typedef struct {
   const float *bias;
   const float *input_weights;
@@ -70,6 +66,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
+  const float *subias;
   const float *diag_weights;
   const qweight *recurrent_weights;
   const int *idx;
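Structural note: the include direction is inverted here. vec.h used to include nnet.h; now nnet.h includes vec.h, the qweight typedef moves next to the kernels that use it, and vec.h selects vec_avx.h, vec_neon.h, or its own generic fallback (which defines NO_OPTIMIZATIONS and can opt into USE_SU_BIAS). The kernels also go from static to static inline, presumably to quiet unused-function warnings now that they live in a header pulled in from several translation units. Reconstructed include graph:

/* nnet.h --> vec.h --> vec_avx.h   (AVX builds)
 *                 \--> vec_neon.h  (NEON builds)
 *                 \--> generic C fallback in vec.h itself
 *                      (NO_OPTIMIZATIONS, optional USE_SU_BIAS) */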
dnn/tansig_table.h

@@ -1,5 +1,8 @@
 /* This file is auto-generated by gen_tables */
 
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
 static const float tansig_table[201] = {
 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@ static const float tansig_table[201] = {
 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
 1.000000f,
 };
+
+#endif /*TANSIG_TABLE_H*/
dump_lpcnet.py

@@ -102,6 +102,9 @@ def dump_sparse_gru(self, f, hf):
     weights = self.get_weights()
     printSparseVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
+    subias = weights[-1].copy()
+    subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+    printVector(f, subias, name + '_subias')
     if hasattr(self, 'activation'):
         activation = self.activation.__name__.upper()
     else:
@@ -112,8 +115,8 @@ def dump_sparse_gru(self, f, hf):
     reset_after = 1
     neurons = weights[0].shape[1]//3
     max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
     hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('extern const SparseGRULayer {};\n\n'.format(name));
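Why clip(w, -1, 1) is the right correction: for a reset_after GRU, weights[-1] has shape (2, 3*N) with row 0 the input bias and row 1 the recurrent bias, and only the recurrent row feeds the quantized kernel, hence the [1,:] index. Assuming the weight scale of 128 implied by SCALE = 128.f*127.f (stored weight ~ round(128*clip(w,-1,1))), the +127 input offset contributes 127*round(128*clip(w))*SCALE_1 per input, which is clip(w,-1,1) up to rounding; summing over inputs gives exactly the np.sum(np.clip(...), axis=0) term subtracted above. A float-level check of that claim (made-up weight value):

#include <math.h>
#include <stdio.h>

int main(void)
{
   float w  = -0.3f;                                    /* float weight */
   float wq = floorf(.5f + 128*fminf(1, fmaxf(-1, w))); /* stored value, scale 128 (assumed) */
   float offset_term = 127.f*wq*(1.f/128.f/127.f);      /* constant added by the +127 offset */
   printf("offset term %f vs clip(w) %f\n", offset_term, w);
   return 0;
}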
dnn/vec.h (132 changed lines)
@@ -26,11 +26,33 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+
+#define NO_OPTIMIZATIONS
+
 
 /* No AVX2/FMA support */
 #ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
 {
    int integer;
    float frac;
|
@ -50,7 +72,7 @@ static float celt_exp2(float x)
|
||||||
}
|
}
|
||||||
#define celt_exp(x) celt_exp2((x)*1.44269504f)
|
#define celt_exp(x) celt_exp2((x)*1.44269504f)
|
||||||
|
|
||||||
static float tansig_approx(float x)
|
static inline float tansig_approx(float x)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
float y, dy;
|
float y, dy;
|
||||||
|
@ -69,19 +91,19 @@ static float tansig_approx(float x)
|
||||||
return sign*y;
|
return sign*y;
|
||||||
}
|
}
|
||||||
|
|
||||||
static OPUS_INLINE float sigmoid_approx(float x)
|
static inline float sigmoid_approx(float x)
|
||||||
{
|
{
|
||||||
return .5f + .5f*tansig_approx(.5f*x);
|
return .5f + .5f*tansig_approx(.5f*x);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void softmax(float *y, const float *x, int N)
|
static inline void softmax(float *y, const float *x, int N)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i=0;i<N;i++)
|
for (i=0;i<N;i++)
|
||||||
y[i] = celt_exp(x[i]);
|
y[i] = celt_exp(x[i]);
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vec_tanh(float *y, const float *x, int N)
|
static inline void vec_tanh(float *y, const float *x, int N)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i=0;i<N;i++)
|
for (i=0;i<N;i++)
|
||||||
|
@ -90,7 +112,7 @@ static void vec_tanh(float *y, const float *x, int N)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void vec_sigmoid(float *y, const float *x, int N)
|
static inline void vec_sigmoid(float *y, const float *x, int N)
|
||||||
{
|
{
|
||||||
int i;
|
int i;
|
||||||
for (i=0;i<N;i++)
|
for (i=0;i<N;i++)
|
||||||
|
@ -99,7 +121,7 @@ static void vec_sigmoid(float *y, const float *x, int N)
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
for (i=0;i<rows;i+=16)
|
for (i=0;i<rows;i+=16)
|
||||||
|
@ -132,7 +154,7 @@ static void sgemv_accum16(float *out, const float *weights, int rows, int cols,
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
|
static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
|
||||||
{
|
{
|
||||||
int i, j;
|
int i, j;
|
||||||
for (i=0;i<rows;i+=16)
|
for (i=0;i<rows;i+=16)
|
||||||
|
@@ -167,42 +189,90 @@ static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
 }
 
 #ifdef DOT_PROD
 
+#define MAX_INPUTS (2048)
+
+
+#define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+
+#ifdef USE_SU_BIAS
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
 {
    int i, j;
+   unsigned x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
    for (i=0;i<rows;i+=8)
    {
-      int cols;
-      cols = *idx++;
-      for (j=0;j<cols;j++)
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
       {
         int pos;
         float * restrict y;
         int xj0, xj1, xj2, xj3;
         pos = 4 * (*idx++);
-        xj0 = floor(.5+127*x[pos+0]);
-        xj1 = floor(.5+127*x[pos+1]);
-        xj2 = floor(.5+127*x[pos+2]);
-        xj3 = floor(.5+127*x[pos+3]);
+        xj0 = x[pos+0];
+        xj1 = x[pos+1];
+        xj2 = x[pos+2];
+        xj3 = x[pos+3];
         y = &out[i];
-        y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
-        y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
-        y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
-        y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
-        y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
-        y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
-        y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
-        y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+        y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+        y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+        y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+        y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+        y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+        y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+        y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+        y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
         w += 32;
      }
    }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
 }
-#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#else /*USE_SU_BIAS*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
 {
    int i, j;
+   signed x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+   for (i=0;i<rows;i+=8)
+   {
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
+      {
+        int pos;
+        float * restrict y;
+        int xj0, xj1, xj2, xj3;
+        pos = 4 * (*idx++);
+        xj0 = x[pos+0];
+        xj1 = x[pos+1];
+        xj2 = x[pos+2];
+        xj3 = x[pos+3];
+        y = &out[i];
+        y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+        y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+        y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+        y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+        y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+        y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+        y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+        y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+        w += 32;
+      }
+   }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
+
+#else /*DOT_PROD*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
+{
+   int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
       int cols;
@@ -257,4 +327,8 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
       }
    }
 }
-#endif
+#endif /*DOT_PROD*/
+
+
+#endif /*no optimizations*/
+#endif /*VEC_H*/
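Note on the SCALE bracketing in the new DOT_PROD kernels: out[] arrives in float scale (it already holds the bias), so it is first multiplied by SCALE = 128*127 to match the raw integer products (weights at scale 128, inputs at scale 127), the inner loop then accumulates bare w*x terms, and a single final SCALE_1 pass converts back, replacing eight per-block SCALE_1 multiplies with two passes over out[]. Also note MAX_INPUTS is a fixed 2048-entry stack buffer with no bounds check on cols; fine for a WIP, but an assert would be cheap. A standalone sketch of the equivalence (values made up):

#include <stdio.h>

#define SCALE   (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)

int main(void)
{
   float bias = 0.25f;  /* accumulator preloaded with a bias */
   int   prod = -4518;  /* an integer partial dot product */
   float bracketed = (bias*SCALE + prod)*SCALE_1; /* new scheme: scale once */
   float per_term  = bias + prod*SCALE_1;         /* old scheme: scale every product */
   printf("%f %f\n", bracketed, per_term);        /* agree up to float rounding */
   return 0;
}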
dnn/vec_avx.h

@@ -29,6 +29,9 @@
   AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
 */
 
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
 #include <immintrin.h>
 
 #ifdef __AVX2__
@@ -246,9 +249,10 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
 }
 
 #else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
       float * restrict y;
@@ -286,3 +290,4 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
 }
 #endif
 
+#endif /*VEC_AVX_H*/
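The AVX fallback only gains the dummy ignore parameter so every sparse_sgemv_accum8x4 variant shares the six-argument signature introduced in vec.h; (void)ignore is the usual idiom for silencing unused-parameter warnings. A minimal illustration (hypothetical kernel, not from the patch):

#include <stdio.h>

static void kernel(float *out, int rows, int ignore)
{
   int i;
   (void)ignore; /* kept only for signature parity across variants */
   for (i=0;i<rows;i++) out[i] += 1.f;
}

int main(void)
{
   float out[2] = {0, 0};
   kernel(out, 2, 1234); /* value is irrelevant on this path */
   printf("%f %f\n", out[0], out[1]);
   return 0;
}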