diff --git a/dnn/nnet.h b/dnn/nnet.h
index fb765519..a5257700 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -56,6 +56,7 @@ typedef struct {
 
 typedef struct {
   const float *bias;
+  const float *subias;
   const qweight *input_weights;
   const float *recurrent_weights;
   int nb_inputs;
diff --git a/dnn/training_tf2/dump_lpcnet.py b/dnn/training_tf2/dump_lpcnet.py
index cfdde727..82109483 100755
--- a/dnn/training_tf2/dump_lpcnet.py
+++ b/dnn/training_tf2/dump_lpcnet.py
@@ -138,6 +138,9 @@ def dump_gru_layer(self, f, hf):
     f.write('#endif /*DOT_PROD*/\n')
     printVector(f, weights[1], name + '_recurrent_weights')
     printVector(f, weights[-1], name + '_bias')
+    subias = weights[-1].copy()
+    subias[0,:] = subias[0,:] - np.sum(np.clip(weights[0], -1, 1),axis=0)
+    printVector(f, subias, name + '_subias')
     if hasattr(self, 'activation'):
         activation = self.activation.__name__.upper()
     else:
@@ -148,8 +151,8 @@ def dump_gru_layer(self, f, hf):
     reset_after = 1
     neurons = weights[0].shape[1]//3
     max_rnn_neurons = max(max_rnn_neurons, neurons)
-    f.write('const GRULayer {} = {{\n   {}_bias,\n   {}_weights,\n   {}_recurrent_weights,\n   {}, {}, ACTIVATION_{}, {}\n}};\n\n'
-            .format(name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
+    f.write('const GRULayer {} = {{\n   {}_bias,\n   {}_subias,\n   {}_weights,\n   {}_recurrent_weights,\n   {}, {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, weights[0].shape[0], weights[0].shape[1]//3, activation, reset_after))
     hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
     hf.write('extern const GRULayer {};\n\n'.format(name));
diff --git a/dnn/vec.h b/dnn/vec.h
index dd55d998..93504b62 100644
--- a/dnn/vec.h
+++ b/dnn/vec.h
@@ -194,6 +194,7 @@ static inline void sparse_sgemv_accum16(float *out, const float *w, int rows, co
 }
 
 #ifdef DOT_PROD
+#define SCALE (128.f*127.f)
 #define SCALE_1 (1.f/128.f/127.f)
@@ -228,11 +229,6 @@ static inline void sgemv_accum8x4(float *out, const qweight *w, int rows, int co
    }
   for (i=0;i
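
Note on the subias term: with DOT_PROD, the signed 8-bit weight codes are multiplied against activation codes that carry an unsigned offset, which leaves a constant excess of sum(clip(w, -1, 1)) in every output. The patch bakes that correction into a second bias vector at dump time, and only in row 0 of the bias (the input-side bias of the two rows a reset_after GRU carries, consistent with the struct, where only input_weights are qweight-quantized). That is presumably what lets the runtime fixup loop be removed from sgemv_accum8x4() in the truncated vec.h hunk. Below is a minimal numpy sketch of the identity being exploited; the quantization conventions (activations stored as round(127*x) + 127 unsigned, weights as the dump script's clipped round(128*w) signed codes) are inferred from SCALE = 128.f*127.f and should be read as assumptions, not as part of the patch.

    import numpy as np

    # Toy sizes; the activation offset convention here is an assumption
    # made for illustration, not taken verbatim from the patch.
    rng = np.random.default_rng(0)
    nb_inputs, nb_neurons = 16, 8
    w = rng.uniform(-1.5, 1.5, (nb_inputs, nb_neurons))  # float weights (some clip)
    x = rng.uniform(-1.0, 1.0, nb_inputs)                # activations in [-1, 1]
    bias = rng.uniform(-1.0, 1.0, nb_neurons)

    SCALE_1 = 1.0 / (128.0 * 127.0)
    # Signed weight codes, same formula as the qweight dump above
    q = np.clip(np.round(128.0 * w), -128, 127)
    # Assumed unsigned activation codes: signed code plus offset 127
    u = np.round(127.0 * x) + 127.0

    # Dump-time correction, same formula as in dump_gru_layer() above
    subias = bias - np.sum(np.clip(w, -1, 1), axis=0)

    # Runtime: unsigned integer dot product, rescaled, plus subias ...
    y_dotprod = subias + SCALE_1 * (q.T @ u)
    # ... agrees with the float GEMV using the ordinary bias
    y_float = bias + np.clip(w, -1, 1).T @ x
    print(np.max(np.abs(y_dotprod - y_float)))  # small, quantization error only

The offset term expands as SCALE_1 * 127 * sum(q) = sum(q)/128, which is approximately sum(clip(w, -1, 1)), so subtracting that sum from the bias once at dump time cancels it. The cost is one extra copy of the bias array in the generated model data; the saving is a per-output correction on every GRU call.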