diff --git a/dnn/nnet.c b/dnn/nnet.c
index 0d81562a..26027eed 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -39,16 +39,13 @@
 #include "nnet.h"
 #include "nnet_data.h"
 
+#ifdef NO_OPTIMIZATIONS
+#warning Compiling without any vectorization. This code will be very slow
+#endif
+
+
 #define SOFTMAX_HACK
 
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
-#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
-#endif
 
 static OPUS_INLINE float relu(float x)
 {
@@ -294,14 +291,19 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
    celt_assert(input != state);
    celt_assert(gru->reset_after);
    RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
    for (i=0;i<3*N;i++)
-      recur[i] = gru->bias[3*N + i];
+      recur[i] = gru->subias[3*N + i];
+#else
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->bias[3*N + i];
+#endif
    for (k=0;k<3;k++)
    {
       for (i=0;i<N;i++)
          recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
    }
-   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
    for (i=0;i<2*N;i++)
       zrh[i] += recur[i];
    compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
diff --git a/dnn/nnet.h b/dnn/nnet.h
index a0033de6..de74be75 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -28,18 +28,14 @@
 #ifndef _NNET_H_
 #define _NNET_H_
 
+#include "vec.h"
+
 #define ACTIVATION_LINEAR 0
 #define ACTIVATION_SIGMOID 1
 #define ACTIVATION_TANH 2
 #define ACTIVATION_RELU 3
 #define ACTIVATION_SOFTMAX 4
 
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
 typedef struct {
   const float *bias;
   const float *input_weights;
@@ -70,6 +66,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
+  const float *subias;
   const float *diag_weights;
   const qweight *recurrent_weights;
   const int *idx;
diff --git a/dnn/tansig_table.h b/dnn/tansig_table.h
index c76f844a..ebec7e3a 100644
--- a/dnn/tansig_table.h
+++ b/dnn/tansig_table.h
@@ -1,5 +1,8 @@
 /* This file is auto-generated by gen_tables */
 
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
 static const float tansig_table[201] = {
 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@ static const float tansig_table[201] = {
 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
 1.000000f,
 };
+
+#endif /*TANSIG_TABLE_H*/
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index 77982449..100d7a47 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -102,6 +102,9 @@ def dump_sparse_gru(self, f, hf):
     weights = self.get_weights()
     printSparseVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
+    subias = weights[-1].copy()
+    subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+    printVector(f, subias, name + '_subias')
     if hasattr(self, 'activation'):
         activation = self.activation.__name__.upper()
     else:
@@ -112,8 +115,8 @@ def dump_sparse_gru(self, f, hf):
     reset_after = 1
     neurons = weights[0].shape[1]//3
     max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
     hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('extern const SparseGRULayer {};\n\n'.format(name));
diff --git a/dnn/vec.h b/dnn/vec.h
index 7317c173..646ed324 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -26,11 +26,33 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+
+#define NO_OPTIMIZATIONS
 
 /* No AVX2/FMA support */
 #ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
 {
    int integer;
    float frac;
@@ -50,7 +72,7 @@ static float celt_exp2(float x)
 }
 #define celt_exp(x) celt_exp2((x)*1.44269504f)
 
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
 {
    int i;
    float y, dy;
@@ -69,19 +91,19 @@ static float tansig_approx(float x)
    return sign*y;
 }
 
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
 {
    return .5f + .5f*tansig_approx(.5f*x);
 }
 
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
 {
     int i;
     for (i=0;i<N;i++)
 #ifdef __AVX2__
@@ -246,9 +249,10 @@
 }
 #else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i