WIP: signed*unsigned arithmetic

Jean-Marc Valin 2020-12-26 02:15:57 -05:00
parent 11736ca9e3
commit bce779886d
6 changed files with 134 additions and 48 deletions
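A rough sketch of the scheme the diff appears to implement (the helper names below are illustrative, not code from the commit): weights in [-1, 1] are stored as signed 8-bit values, activations in [-1, 1] are mapped to unsigned 8-bit values with a +127 offset, and the accumulated integer dot product is rescaled by SCALE_1 = 1/(128*127).

/* Sketch only: the two 8-bit mappings implied by SCALE = 128.f*127.f.
 * quant_weight()/quant_activation() are hypothetical helpers. */
#include <math.h>

static signed char quant_weight(float w)
{
   float c = fminf(1.f, fmaxf(-1.f, w));      /* weights are clipped to [-1, 1] */
   int q = (int)floor(.5 + 128*c);            /* weight scale appears to be 128 (SCALE/127) */
   return (signed char)(q > 127 ? 127 : q);   /* stay inside signed 8-bit range */
}

static unsigned char quant_activation(float x)
{
   /* [-1, 1] -> [0, 254]; the +127 offset is what makes the operand unsigned */
   return (unsigned char)(127 + (int)floor(.5 + 127*x));
}

With these mappings, sum(wq*u) is roughly 128*127*(w.x) + 128*127*sum(w), so after the SCALE_1 rescale each output row picks up an extra sum(w); the new subias vector introduced by this commit subtracts exactly that term (see the dump script change below).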

dnn/nnet.c

@@ -39,16 +39,13 @@
 #include "nnet.h"
 #include "nnet_data.h"
 
+#ifdef NO_OPTIMIZATIONS
+#warning Compiling without any vectorization. This code will be very slow
+#endif
+
 #define SOFTMAX_HACK
 
-#ifdef __AVX__
-#include "vec_avx.h"
-#elif __ARM_NEON__
-#include "vec_neon.h"
-#else
-#warning Compiling without any vectorization. This code will be very slow
-#include "vec.h"
-#endif
 
 static OPUS_INLINE float relu(float x)
 {
@@ -294,14 +291,19 @@ void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *in
    celt_assert(input != state);
    celt_assert(gru->reset_after);
    RNN_COPY(zrh, input, 3*N);
+#ifdef USE_SU_BIAS
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->subias[3*N + i];
+#else
    for (i=0;i<3*N;i++)
       recur[i] = gru->bias[3*N + i];
+#endif
    for (k=0;k<3;k++)
    {
       for (i=0;i<N;i++)
          recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
    }
-   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+   sparse_sgemv_accum8x4(recur, gru->recurrent_weights, 3*N, 3*N, gru->idx, state);
    for (i=0;i<2*N;i++)
       zrh[i] += recur[i];
    compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);

dnn/nnet.h

@@ -28,18 +28,14 @@
 #ifndef _NNET_H_
 #define _NNET_H_
 
+#include "vec.h"
+
 #define ACTIVATION_LINEAR 0
 #define ACTIVATION_SIGMOID 1
 #define ACTIVATION_TANH 2
 #define ACTIVATION_RELU 3
 #define ACTIVATION_SOFTMAX 4
 
-#ifdef DOT_PROD
-typedef signed char qweight;
-#else
-typedef float qweight;
-#endif
-
 typedef struct {
   const float *bias;
   const float *input_weights;
@@ -70,6 +66,7 @@ typedef struct {
 typedef struct {
   const float *bias;
+  const float *subias;
   const float *diag_weights;
   const qweight *recurrent_weights;
   const int *idx;

dnn/tansig_table.h

@@ -1,5 +1,8 @@
 /* This file is auto-generated by gen_tables */
 
+#ifndef TANSIG_TABLE_H
+#define TANSIG_TABLE_H
+
 static const float tansig_table[201] = {
 0.000000f, 0.039979f, 0.079830f, 0.119427f, 0.158649f,
 0.197375f, 0.235496f, 0.272905f, 0.309507f, 0.345214f,
@@ -43,3 +46,5 @@ static const float tansig_table[201] = {
 1.000000f, 1.000000f, 1.000000f, 1.000000f, 1.000000f,
 1.000000f,
 };
+
+#endif /*TANSIG_TABLE_H*/

dnn/training_tf2/dump_lpcnet.py

@@ -102,6 +102,9 @@ def dump_sparse_gru(self, f, hf):
     weights = self.get_weights()
     printSparseVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
+    subias = weights[-1].copy()
+    subias[1,:] = subias[1,:] - np.sum(np.clip(weights[1], -1, 1),axis=0)
+    printVector(f, subias, name + '_subias')
     if hasattr(self, 'activation'):
         activation = self.activation.__name__.upper()
     else:
@@ -112,8 +115,8 @@ def dump_sparse_gru(self, f, hf):
     reset_after = 1
     neurons = weights[0].shape[1]//3
     max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+    f.write('const SparseGRULayer {} = {{\n {}_bias,\n {}_subias,\n {}_recurrent_weights_diag,\n {}_recurrent_weights,\n {}_recurrent_weights_idx,\n {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
     hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('extern const SparseGRULayer {};\n\n'.format(name));
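The subias line above falls out of the offset arithmetic: with u = 127 + round(127*x) and wq roughly 128*w, the accumulated sum is about 128*127*(w.x) + 128*127*sum(w), so after the 1/(128*127) rescale each output carries an extra sum(clip(w, -1, 1)), which is exactly what gets subtracted from the bias. A small, self-contained check for one output neuron (illustrative plain C, not code from the repository):

#include <math.h>
#include <stdio.h>

#define N 4

int main(void)
{
   /* One row of (already clipped) weights and one input vector, both in [-1,1]. */
   const float w[N] = {0.5f, -0.25f, 0.75f, -0.9f};
   const float x[N] = {0.3f, -0.8f, 0.1f, 0.6f};
   const float bias = 0.1f;
   float ref, su, subias;
   int acc = 0;
   int i;

   /* Float reference: bias + w.x */
   ref = bias;
   for (i=0;i<N;i++) ref += w[i]*x[i];

   /* subias = bias - sum(clip(w,-1,1)), mirroring the dump script change */
   subias = bias;
   for (i=0;i<N;i++) subias -= w[i];

   /* Signed weight (~128*w) times unsigned, offset input (127 + ~127*x) */
   for (i=0;i<N;i++)
   {
      int wq = (int)floor(.5 + 128*w[i]);
      int u = 127 + (int)floor(.5 + 127*x[i]);
      acc += wq*u;
   }
   su = subias + acc*(1.f/128.f/127.f);

   /* The two results agree up to 8-bit quantization error. */
   printf("float: %f  signed*unsigned + subias: %f\n", ref, su);
   return 0;
}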

dnn/vec.h

@@ -26,11 +26,33 @@
    SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */
 
-#include "nnet.h"
+#ifndef VEC_H
+#define VEC_H
+
+#include "tansig_table.h"
+#include "opus_types.h"
+#include <math.h>
+#include "arch.h"
+
+#ifdef DOT_PROD
+typedef signed char qweight;
+#else
+typedef float qweight;
+#endif
+
+#ifdef __AVX__
+#include "vec_avx.h"
+#elif __ARM_NEON__
+#include "vec_neon.h"
+#else
+
+//#define USE_SU_BIAS
+#define NO_OPTIMIZATIONS
 
 /* No AVX2/FMA support */
 #ifndef LPCNET_TEST
-static float celt_exp2(float x)
+static inline float celt_exp2(float x)
 {
    int integer;
    float frac;
@@ -50,7 +72,7 @@ static float celt_exp2(float x)
 }
 
 #define celt_exp(x) celt_exp2((x)*1.44269504f)
-static float tansig_approx(float x)
+static inline float tansig_approx(float x)
 {
    int i;
    float y, dy;
@@ -69,19 +91,19 @@ static float tansig_approx(float x)
    return sign*y;
 }
 
-static OPUS_INLINE float sigmoid_approx(float x)
+static inline float sigmoid_approx(float x)
 {
    return .5f + .5f*tansig_approx(.5f*x);
 }
 
-static void softmax(float *y, const float *x, int N)
+static inline void softmax(float *y, const float *x, int N)
 {
    int i;
    for (i=0;i<N;i++)
       y[i] = celt_exp(x[i]);
 }
 
-static void vec_tanh(float *y, const float *x, int N)
+static inline void vec_tanh(float *y, const float *x, int N)
 {
    int i;
    for (i=0;i<N;i++)
@@ -90,7 +112,7 @@ static void vec_tanh(float *y, const float *x, int N)
    }
 }
 
-static void vec_sigmoid(float *y, const float *x, int N)
+static inline void vec_sigmoid(float *y, const float *x, int N)
 {
    int i;
    for (i=0;i<N;i++)
@@ -99,7 +121,7 @@ static void vec_sigmoid(float *y, const float *x, int N)
    }
 }
 #endif
-static void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
+static inline void sgemv_accum16(float *out, const float *weights, int rows, int cols, int col_stride, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=16)
@@ -132,7 +154,7 @@ static void sgemv_accum16(float *out, const float *weights, int rows, int cols,
    }
 }
 
-static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
+static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, const int *idx, const float *x)
 {
    int i, j;
    for (i=0;i<rows;i+=16)
@@ -167,42 +189,90 @@ static void sparse_sgemv_accum16(float *out, const float *w, int rows, const int
 }
 
 #ifdef DOT_PROD
+#define MAX_INPUTS (2048)
+#define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
 
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#ifdef USE_SU_BIAS
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
 {
    int i, j;
+   unsigned x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = 127+floor(.5+127*_x[i]);
    for (i=0;i<rows;i+=8)
    {
-      int cols;
-      cols = *idx++;
-      for (j=0;j<cols;j++)
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
       {
         int pos;
         float * restrict y;
         int xj0, xj1, xj2, xj3;
         pos = 4 * (*idx++);
-         xj0 = floor(.5+127*x[pos+0]);
-         xj1 = floor(.5+127*x[pos+1]);
-         xj2 = floor(.5+127*x[pos+2]);
-         xj3 = floor(.5+127*x[pos+3]);
+         xj0 = x[pos+0];
+         xj1 = x[pos+1];
+         xj2 = x[pos+2];
+         xj3 = x[pos+3];
         y = &out[i];
-         y[0] += SCALE_1*(w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
-         y[1] += SCALE_1*(w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
-         y[2] += SCALE_1*(w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
-         y[3] += SCALE_1*(w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
-         y[4] += SCALE_1*(w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
-         y[5] += SCALE_1*(w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
-         y[6] += SCALE_1*(w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
-         y[7] += SCALE_1*(w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+         y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+         y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+         y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+         y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+         y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+         y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+         y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+         y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
         w += 32;
       }
    }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
 }
-
-#else
-static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const int *idx, const float *x)
+#else /*USE_SU_BIAS*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int cols, const int *idx, const float *_x)
+{
+   int i, j;
+   signed x[MAX_INPUTS];
+   for (i=0;i<rows;i++) out[i] *= SCALE;
+   for (i=0;i<cols;i++) x[i] = floor(.5+127*_x[i]);
+   for (i=0;i<rows;i+=8)
+   {
+      int colblocks;
+      colblocks = *idx++;
+      for (j=0;j<colblocks;j++)
+      {
+         int pos;
+         float * restrict y;
+         int xj0, xj1, xj2, xj3;
+         pos = 4 * (*idx++);
+         xj0 = x[pos+0];
+         xj1 = x[pos+1];
+         xj2 = x[pos+2];
+         xj3 = x[pos+3];
+         y = &out[i];
+         y[0] += (w[0]*xj0+w[1]*xj1+w[2]*xj2+w[3]*xj3);
+         y[1] += (w[4]*xj0+w[5]*xj1+w[6]*xj2+w[7]*xj3);
+         y[2] += (w[8]*xj0+w[9]*xj1+w[10]*xj2+w[11]*xj3);
+         y[3] += (w[12]*xj0+w[13]*xj1+w[14]*xj2+w[15]*xj3);
+         y[4] += (w[16]*xj0+w[17]*xj1+w[18]*xj2+w[19]*xj3);
+         y[5] += (w[20]*xj0+w[21]*xj1+w[22]*xj2+w[23]*xj3);
+         y[6] += (w[24]*xj0+w[25]*xj1+w[26]*xj2+w[27]*xj3);
+         y[7] += (w[28]*xj0+w[29]*xj1+w[30]*xj2+w[31]*xj3);
+         w += 32;
+      }
+   }
+   for (i=0;i<rows;i++) out[i] *= SCALE_1;
+}
+#endif /*USE_SU_BIAS*/
+#else /*DOT_PROD*/
+static inline void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
       int cols;
@@ -257,4 +327,8 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *w, int rows, const
       }
    }
 }
-#endif
+#endif /*DOT_PROD*/
+
+#endif /*no optimizations*/
+
+#endif /*VEC_H*/

dnn/vec_avx.h

@@ -29,6 +29,9 @@
   AVX2/FMA implementation of vector operations, compile with -mavx2 -mfma
 */
 
+#ifndef VEC_AVX_H
+#define VEC_AVX_H
+
 #include <immintrin.h>
 
 #ifdef __AVX2__
@@ -246,9 +249,10 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows,
 }
 #else
-static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, const int *idx, const float *x)
+static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows, int ignore, const int *idx, const float *x)
 {
    int i, j;
+   (void)ignore;
    for (i=0;i<rows;i+=8)
    {
       float * restrict y;
@@ -286,3 +290,4 @@ static void sparse_sgemv_accum8x4(float *out, const qweight *weights, int rows,
 }
 #endif
+
+#endif /*VEC_AVX_H*/