From 40b309d92bf735af174e44e657c749bd6b5e92ba Mon Sep 17 00:00:00 2001
From: Jean-Marc Valin
Date: Tue, 29 Dec 2020 02:35:29 -0500
Subject: [PATCH] WIP: 8-bit SIMD for GRU B

---
 dnn/dump_data.c                 |  2 ++
 dnn/nnet.c                      |  6 +++++
 dnn/nnet.h                      |  2 +-
 dnn/training_tf2/dump_lpcnet.py | 10 +++++++-
 dnn/vec.h                       | 45 +++++++++++++++++++++++++++++----
 5 files changed, 58 insertions(+), 7 deletions(-)

diff --git a/dnn/dump_data.c b/dnn/dump_data.c
index 777d6487..38e22d34 100644
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <unistd.h>
 #include "kiss_fft.h"
 #include "common.h"
 #include 
@@ -141,6 +142,7 @@ int main(int argc, char **argv) {
   int encode = 0;
   int decode = 0;
   int quantize = 0;
+  srand(getpid());
   st = lpcnet_encoder_create();
   if (argc == 5 && strcmp(argv[1], "-train")==0) training = 1;
   if (argc == 5 && strcmp(argv[1], "-qtrain")==0) {
diff --git a/dnn/nnet.c b/dnn/nnet.c
index ac23741d..012fc9bf 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -140,6 +140,7 @@ void compute_mdense(const MDenseLayer *layer, float *output, const float *input)
    compute_activation(output, output, N, layer->activation);
 }
 
+#if 0
 void compute_gru(const GRULayer *gru, float *state, const float *input)
 {
    int i;
@@ -201,6 +202,7 @@ void compute_gru(const GRULayer *gru, float *state, const float *input)
    for (i=0;i<N;i++)
       state[i] = h[i];
 }
+#endif
@@ ... @@
    for (i=0;i<3*N;i++)
       zrh[i] = gru->bias[i];
+#if 1
+   sgemv_accum8x4(zrh, gru->input_weights, 3*N, M, stride, input);
+#else
    sgemv_accum(zrh, gru->input_weights, 3*N, M, stride, input);
+#endif
    for (i=0;i<3*N;i++)
       recur[i] = gru->bias[3*N + i];
    sgemv_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
diff --git a/dnn/nnet.h b/dnn/nnet.h
index de74be75..fb765519 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -56,7 +56,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
-  const float *input_weights;
+  const qweight *input_weights;
   const float *recurrent_weights;
   int nb_inputs;
   int nb_neurons;
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index 6daf517a..cfdde727 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -39,7 +39,10 @@ max_rnn_neurons = 1
 max_conv_inputs = 1
 max_mdense_tmp = 1
 
-def printVector(f, vector, name, dtype='float'):
+def printVector(f, vector, name, dtype='float', dotp=False):
+    if dotp:
+        vector = vector.reshape((vector.shape[0]//4, 4, vector.shape[1]//8, 8))
+        vector = vector.transpose((2, 0, 3, 1))
     v = np.reshape(vector, (-1));
     #print('static const float ', name, '[', len(v), '] = \n', file=f)
     f.write('static const {} {}[{}] = {{\n '.format(dtype, name, len(v)))
@@ -127,7 +130,12 @@ def dump_gru_layer(self, f, hf):
     name = self.name
     print("printing layer " + name + " of type " + self.__class__.__name__)
     weights = self.get_weights()
+    f.write('#ifdef DOT_PROD\n')
+    qweight = np.clip((128*weights[0]).astype('int'), -128, 127)
+    printVector(f, qweight, name + '_weights', dotp=True, dtype='qweight')
+    f.write('#else /*DOT_PROD*/\n')
     printVector(f, weights[0], name + '_weights')
+    f.write('#endif /*DOT_PROD*/\n')
     printVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
     if hasattr(self, 'activation'):
diff --git a/dnn/vec.h b/dnn/vec.h
index 5bf73941..dd55d998 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -41,10 +41,11 @@
 #include "vec_neon.h"
 #else
 
+#define MAX_INPUTS (2048)
 #define NO_OPTIMIZATIONS
 
-//#define DOT_PROD
+#define DOT_PROD
 //#define USE_SU_BIAS
 
 #ifdef DOT_PROD
@@ -193,13 +194,47 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
 }
 
 #ifdef DOT_PROD
-
-#define MAX_INPUTS (2048)
-
-
 #define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
 
+static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, int col_stride, const float *_x)
+{
+   int i, j;
+   signed char x[MAX_INPUTS];
+   (void)col_stride;
+   for (i=0;i
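
The last hunk is cut off before the body of sgemv_accum8x4() is complete. What the patch does establish is the quantization convention: dump_lpcnet.py scales the GRU B input weights by 128 and clips them to signed 8 bits, the kernel quantizes each input sample with a factor of 127, and SCALE/SCALE_1 move the float accumulator into and out of that 128*127 fixed-point domain. The dotp reshape/transpose in printVector() stores the weights in 8x4 blocks: for every group of 8 output rows and 4 input columns, the 32 bytes hold 4 consecutive input weights for each of the 8 rows in turn. The scalar sketch below only illustrates that layout and scaling under those assumptions; it is not the SIMD kernel the truncated hunk adds, and the name sgemv_accum8x4_scalar, the qweight typedef, and the plain C loops are made up for the example.

#include <math.h>

#define MAX_INPUTS (2048)
#define SCALE (128.f*127.f)
#define SCALE_1 (1.f/128.f/127.f)

typedef signed char qweight;   /* assumed 8-bit weight type for this sketch */

/* Scalar illustration of an 8x4-blocked 8-bit GEMV accumulate.
   Assumes the blocked weight order written by printVector(..., dotp=True):
   blocks of 8 output rows by 4 input columns, stored as 8 groups of 4
   consecutive input weights.  Assumes rows is a multiple of 8, cols is a
   multiple of 4 (guaranteed by the reshape in dump_lpcnet.py), cols <=
   MAX_INPUTS, and inputs roughly in [-1, 1]. */
static inline void sgemv_accum8x4_scalar(float *out, const qweight *w,
                                         int rows, int cols, const float *_x)
{
   int i, j, k;
   signed char x[MAX_INPUTS];
   /* Move the float accumulator (bias) into the 128*127 fixed-point domain. */
   for (i=0;i<rows;i++) out[i] *= SCALE;
   /* Quantize the input vector to signed 8 bits (scale by 127). */
   for (i=0;i<cols;i++) x[i] = (signed char)floor(.5 + 127*_x[i]);
   for (i=0;i<rows;i+=8)
   {
      for (j=0;j<cols;j+=4)
      {
         /* One 8x4 tile: for each of the 8 output rows, accumulate the
            dot product of 4 int8 weights with 4 quantized inputs. */
         for (k=0;k<8;k++)
         {
            out[i+k] += w[0]*x[j] + w[1]*x[j+1] + w[2]*x[j+2] + w[3]*x[j+3];
            w += 4;
         }
      }
   }
   /* Back to float: undo the 128*127 scaling. */
   for (i=0;i<rows;i++) out[i] *= SCALE_1;
}

Reordering the weights at dump time is what makes each 8x4 tile a contiguous 32-byte block, so a SIMD version (the "8-bit SIMD for GRU B" in the subject) can load it directly into 8-bit multiply-accumulate instructions instead of gathering strided floats.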