From 40b309d92bf735af174e44e657c749bd6b5e92ba Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin
Date: Tue, 29 Dec 2020 02:35:29 -0500
Subject: [PATCH] WIP: 8-bit SIMD for GRU B

---
 dnn/dump_data.c                 |  2 ++
 dnn/nnet.c                      |  6 +++++
 dnn/nnet.h                      |  2 +-
 dnn/training_tf2/dump_lpcnet.py | 10 +++++++-
 dnn/vec.h                       | 45 +++++++++++++++++++++++++++++----
 5 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/dnn/dump_data.c b/dnn/dump_data.c
index 777d6487..38e22d34 100644
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <unistd.h>
 #include "kiss_fft.h"
 #include "common.h"
 #include 
@@ -141,6 +142,7 @@ int main(int argc, char **argv) {
   int encode = 0;
   int decode = 0;
   int quantize = 0;
+  srand(getpid());
   st = lpcnet_encoder_create();
   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
   if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
diff --git a/dnn/nnet.c b/dnn/nnet.c
index ac23741d..012fc9bf 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -140,6 +140,7 @@ void compute_mdense(const MDenseLayer *layer, float *output, const float *input)
    compute_activation(output, output, N, layer->activation);
 }
 
+#if 0
 void compute_gru(const GRULayer *gru, float *state, const float *input)
 {
    int i;
@@ -201,6 +202,7 @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
    for (i=0;i<N;i++)
       state[i] = h[i];
 }
+#endif
@@ ... @@
    for (i=0;i<3*N;i++)
       zrh[i] = gru->bias[i];
+#if 1
+   sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
+#else
    sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
    for (i=0;i<3*N;i++)
       recur[i] = gru->bias[3*N + i];
    sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
diff --git a/dnn/nnet.h b/dnn/nnet.h
index de74be75..fb765519 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -56,7 +56,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
-  const float *input_weights;
+  const qweight *input_weights;
   const float *recurrent_weights;
   int nb_inputs;
   int nb_neurons;
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index 6daf517a..cfdde727 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -39,7 +39,10 @@ max_rnn_neurons = 1
 max_conv_inputs = 1
 max_mdense_tmp = 1
 
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+    if dotp:
+        vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+        vector = vector.transpose((2, 0, 3, 1))
     v = np.reshape(vector, (-1));
     #print('static const float ', name, '[', len(v), '] = \n', file=f)
     f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
@@ -127,7 +130,12 @@ def dump_gru_layer(self, f, hf):
     name = self.name
     print("printing layer " + name + " of type " + self.__class__.__name__)
     weights = self.get_weights()
+    f.write('#ifdef DOT_PROD\n')
+    qweight = np.clip((128*weights[0]).astype('int'), -128, 127)
+    printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+    f.write('#else /*DOT_PROD*/\n')
     printVector(f, weights[0], name + '_weights')
+    f.write('#endif /*DOT_PROD*/\n')
     printVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
     if hasattr(self, 'activation'):
diff --git a/dnn/vec.h b/dnn/vec.h
index 5bf73941..dd55d998 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -41,10 +41,11 @@
 #include "vec_neon.h"
 #else
 
+#define MAX_INPUTS (2048)
 #define NO_OPTIMIZATIONS
 
-//#define DOT_PROD
+#define DOT_PROD
 //#define USE_SU_BIAS
 
 #ifdef DOT_PROD
@@ -193,13 +194,47 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
 }
 
 #ifdef DOT_PROD
-
-#define MAX_INPUTS (2048)
-
-
 #define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
 
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+   int i, j;
+   signed char x[MAX_INPUTS];
+   (void)col_stride;
+   for (i=0;i
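
The last hunk is cut off before the body of sgemv_accum8x4() is complete. What the patch does establish is the quantization convention: dump_lpcnet.py scales the GRU B input weights by 128 and clips them to signed 8 bits, the kernel quantizes each input sample with a factor of 127, and SCALE/SCALE_1 move the float accumulator into and out of that 128*127 fixed-point domain. The dotp reshape/transpose in printVector() stores the weights in 8x4 blocks: for every group of 8 output rows and 4 input columns, the 32 bytes hold 4 consecutive input weights for each of the 8 rows in turn. The scalar sketch below only illustrates that layout and scaling under those assumptions; it is not the SIMD kernel the truncated hunk adds, and the name sgemv_accum8x4_scalar, the qweight typedef, and the plain C loops are made up for the example.

#include <math.h>

#define MAX_INPUTS (2048)
#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)

typedef signed char qweight;   /* assumed 8-bit weight type for this sketch */

/* Scalar illustration of an 8x4-blocked 8-bit GEMV accumulate.
   Assumes the blocked weight order written by printVector(..., dotp=True):
   blocks of 8 output rows by 4 input columns, stored as 8 groups of 4
   consecutive input weights.  Assumes rows is a multiple of 8, cols is a
   multiple of 4 (guaranteed by the reshape in dump_lpcnet.py), cols <=
   MAX_INPUTS, and inputs roughly in [-1, 1]. */
static inline void sgemv_accum8x4_scalar(float *out, const qweight *w,
                                         int rows, int cols, const float *_x)
{
   int i, j, k;
   signed char x[MAX_INPUTS];
   /* Move the float accumulator (bias) into the 128*127 fixed-point domain. */
   for (i=0;i<rows;i++) out[i] *= SCALE;
   /* Quantize the input vector to signed 8 bits (scale by 127). */
   for (i=0;i<cols;i++) x[i] = (signed char)floor(.5 + 127*_x[i]);
   for (i=0;i<rows;i+=8)
   {
      for (j=0;j<cols;j+=4)
      {
         /* One 8x4 tile: for each of the 8 output rows, accumulate the
            dot product of 4 int8 weights with 4 quantized inputs. */
         for (k=0;k<8;k++)
         {
            out[i+k] += w[0]*x[j] + w[1]*x[j+1] + w[2]*x[j+2] + w[3]*x[j+3];
            w += 4;
         }
      }
   }
   /* Back to float: undo the 128*127 scaling. */
   for (i=0;i<rows;i++) out[i] *= SCALE_1;
}

Reordering the weights at dump time is what makes each 8x4 tile a contiguous 32-byte block, so a SIMD version (the "8-bit SIMD for GRU B" in the subject) can load it directly into 8-bit multiply-accumulate instructions instead of gathering strided floats.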