diff --git a/dnn/dump_lpcnet.py b/dnn/dump_lpcnet.py
index 506da472..0e2dda3f 100755
--- a/dnn/dump_lpcnet.py
+++ b/dnn/dump_lpcnet.py
@@ -91,18 +91,22 @@ def dump_gru_layer(self, f, hf):
 CuDNNGRU.dump_layer = dump_gru_layer
 GRU.dump_layer = dump_gru_layer
 
+def dump_dense_layer_impl(name, weights, bias, activation, f, hf):
+    printVector(f, weights, name + '_weights')
+    printVector(f, bias, name + '_bias')
+    f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
+            .format(name, name, name, weights.shape[0], weights.shape[1], activation))
+    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+    hf.write('extern const DenseLayer {};\n\n'.format(name));
+
 def dump_dense_layer(self, f, hf):
     name = self.name
     print("printing layer " + name + " of type " + self.__class__.__name__)
     weights = self.get_weights()
-    printVector(f, weights[0], name + '_weights')
-    printVector(f, weights[-1], name + '_bias')
     activation = self.activation.__name__.upper()
-    f.write('const DenseLayer {} = {{\n {}_bias,\n {}_weights,\n {}, {}, ACTIVATION_{}\n}};\n\n'
-            .format(name, name, name, weights[0].shape[0], weights[0].shape[1], activation))
-    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]))
-    hf.write('extern const DenseLayer {};\n\n'.format(name));
+    dump_dense_layer_impl(name, weights[0], weights[1], activation, f, hf)
     return False
+
 Dense.dump_layer = dump_dense_layer
 
 def dump_mdense_layer(self, f, hf):
@@ -141,15 +145,18 @@ def dump_conv1d_layer(self, f, hf):
 
 Conv1D.dump_layer = dump_conv1d_layer
 
+def dump_embedding_layer_impl(name, weights, f, hf):
+    printVector(f, weights, name + '_weights')
+    f.write('const EmbeddingLayer {} = {{\n {}_weights,\n {}, {}\n}};\n\n'
+            .format(name, name, weights.shape[0], weights.shape[1]))
+    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights.shape[1]))
+    hf.write('extern const EmbeddingLayer {};\n\n'.format(name));
+
 def dump_embedding_layer(self, f, hf):
     name = self.name
     print("printing layer " + name + " of type " + self.__class__.__name__)
-    weights = self.get_weights()
-    printVector(f, weights[0], name + '_weights')
-    f.write('const EmbeddingLayer {} = {{\n {}_weights,\n {}, {}\n}};\n\n'
-            .format(name, name, weights[0].shape[0], weights[0].shape[1]))
-    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]))
-    hf.write('extern const EmbeddingLayer {};\n\n'.format(name));
+    weights = self.get_weights()[0]
+    dump_embedding_layer_impl(name, weights, f, hf)
     return False
 
 Embedding.dump_layer = dump_embedding_layer
@@ -178,6 +185,21 @@ f.write('#ifdef HAVE_CONFIG_H\n#include "config.h"\n#endif\n\n#include "nnet.h"\
 hf.write('/*This file is automatically generated from a Keras model*/\n\n')
 hf.write('#ifndef RNN_DATA_H\n#define RNN_DATA_H\n\n#include "nnet.h"\n\n')
 
+embed_size = lpcnet.embed_size
+
+E = model.get_layer('embed_sig').get_weights()[0]
+W = model.layers[18].get_weights()[0][:embed_size,:]
+dump_embedding_layer_impl('gru_a_embed_sig', np.dot(E, W), f, hf)
+W = model.layers[18].get_weights()[0][embed_size:2*embed_size,:]
+dump_embedding_layer_impl('gru_a_embed_pred', np.dot(E, W), f, hf)
+E = model.get_layer('embed_exc').get_weights()[0]
+W = model.layers[18].get_weights()[0][2*embed_size:3*embed_size,:]
+dump_embedding_layer_impl('gru_a_embed_exc', np.dot(E, W), f, hf)
+W = model.layers[18].get_weights()[0][3*embed_size:,:]
+#FIXME: dump only half the biases
+b = model.layers[18].get_weights()[2]
+dump_dense_layer_impl('gru_a_dense_feature', W, b, 'LINEAR', f, hf)
+
 layer_list = []
 for i, layer in enumerate(model.layers):
     if layer.dump_layer(f, hf):
diff --git a/dnn/lpcnet.c b/dnn/lpcnet.c
index 30451284..f6827a7c 100644
--- a/dnn/lpcnet.c
+++ b/dnn/lpcnet.c
@@ -116,12 +116,17 @@ void run_frame_network(LPCNetState *lpcnet, float *condition, const float *featu
 void run_sample_network(NNetState *net, float *pdf, const float *condition, int last_exc, int last_sig, int pred)
 {
    float in_a[SAMPLE_INPUT_SIZE];
+   float gru_a_input[3*GRU_A_STATE_SIZE];
    float in_b[GRU_A_STATE_SIZE+FEATURE_DENSE2_OUT_SIZE];
    compute_embedding(&embed_sig, &in_a[0], last_sig);
    compute_embedding(&embed_sig, &in_a[EMBED_SIG_OUT_SIZE], pred);
    compute_embedding(&embed_exc, &in_a[2*EMBED_SIG_OUT_SIZE], last_exc);
    RNN_COPY(&in_a[2*EMBED_SIG_OUT_SIZE + EMBED_EXC_OUT_SIZE], condition, FEATURE_DENSE2_OUT_SIZE);
-   compute_gru2(&gru_a, net->gru_a_state, in_a);
+   compute_dense(&gru_a_dense_feature, gru_a_input, condition);
+   accum_embedding(&gru_a_embed_sig, gru_a_input, last_sig);
+   accum_embedding(&gru_a_embed_pred, gru_a_input, pred);
+   accum_embedding(&gru_a_embed_exc, gru_a_input, last_exc);
+   compute_gru3(&gru_a, net->gru_a_state, gru_a_input);
    RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);
    RNN_COPY(&in_b[GRU_A_STATE_SIZE], condition, FEATURE_DENSE2_OUT_SIZE);
    compute_gru2(&gru_b, net->gru_b_state, in_b);
diff --git a/dnn/nnet.c b/dnn/nnet.c
index a8fa704f..9d32fb3b 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -256,6 +256,40 @@ void compute_gru2(const GRULayer *gru, float *state, const float *input)
       state[i] = h[i];
 }
 
+void compute_gru3(const GRULayer *gru, float *state, const float *input)
+{
+   int i;
+   int N;
+   int stride;
+   float zrh[3*MAX_RNN_NEURONS];
+   float recur[3*MAX_RNN_NEURONS];
+   float *z;
+   float *r;
+   float *h;
+   N = gru->nb_neurons;
+   z = zrh;
+   r = &zrh[N];
+   h = &zrh[2*N];
+   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
+   celt_assert(input != state);
+   celt_assert(gru->reset_after);
+   stride = 3*N;
+   RNN_COPY(zrh, input, 3*N);
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->bias[3*N + i];
+   gemm_accum(recur, gru->recurrent_weights, 3*N, N, stride, state);
+   for (i=0;i<2*N;i++)
+      zrh[i] += recur[i];
+   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+   for (i=0;i<N;i++)
+      h[i] += recur[2*N+i]*r[i];
+   compute_activation(h, h, N, gru->activation);
+   for (i=0;i<N;i++)
+      h[i] = z[i]*state[i] + (1-z[i])*h[i];
+   for (i=0;i<N;i++)
+      state[i] = h[i];
+}
+
+void accum_embedding(const EmbeddingLayer *layer, float *output, int input)
+{
+   int i;
+   celt_assert(input >= 0);
+   celt_assert(input < layer->nb_inputs);
+   /*if (layer->dim == 64) printf("%d\n", input);*/
+   for (i=0;i<layer->dim;i++)
+   {
+      output[i] += layer->embedding_weights[input*layer->dim + i];
+   }
+}
+
 int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor)
 {
    int i;
diff --git a/dnn/nnet.h b/dnn/nnet.h
index a3e2d2aa..1154b404 100644
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@@ -87,10 +87,14 @@ void compute_gru(const GRULayer *gru, float *state, const float *input);
 
 void compute_gru2(const GRULayer *gru, float *state, const float *input);
 
+void compute_gru3(const GRULayer *gru, float *state, const float *input);
+
 void compute_conv1d(const Conv1DLayer *layer, float *output, float *mem, const float *input);
 
 void compute_embedding(const EmbeddingLayer *layer, float *output, int input);
 
+void accum_embedding(const EmbeddingLayer *layer, float *output, int input);
+
 int sample_from_pdf(const float *pdf, int N, float exp_boost, float pdf_floor);
 
 #endif /* _MLP_H_ */
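
Background for the transformation above (not part of the patch itself): GRU A's non-recurrent input is built entirely from embedding lookups plus the frame-rate condition vector, so the input-weight product can be folded into the embedding tables offline. That is what the np.dot(E, W) tables dumped as gru_a_embed_* rely on, and why compute_gru3() only copies a precomputed input where compute_gru2() ran a full input GEMM every sample. A minimal numpy sketch of the identity, with made-up sizes and random weights rather than anything taken from the real model:

import numpy as np

# Toy sizes, purely illustrative; the real tables come from the trained
# embed_sig/embed_exc layers and gru_a's kernel in dump_lpcnet.py.
levels, embed_size, units = 256, 128, 384
E = np.random.randn(levels, embed_size)      # embedding table, one row per input value
W = np.random.randn(embed_size, 3 * units)   # this input's slice of GRU A's input weights

s = 97  # one discrete sample value

# Per-sample path before the patch: embedding lookup, then input-weight product.
ref = np.dot(E[s], W)

# Path after the patch: fold W into the table once (np.dot(E, W)); per sample it
# is just a row lookup, which accum_embedding() adds into gru_a_input in C.
table = np.dot(E, W)
out = table[s]

print(np.allclose(ref, out))   # True: both paths give the same GRU input

The condition vector is the only input that still needs a real matrix product, which is why it goes through gru_a_dense_feature before the three embedding contributions are accumulated on top.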