diff --git a/dnn/dump_lpcnet.py b/dnn/dump_lpcnet.py
index 0e2dda3f..870cfc8d 100755
--- a/dnn/dump_lpcnet.py
+++ b/dnn/dump_lpcnet.py
@@ -41,10 +41,10 @@ max_rnn_neurons = 1
 max_conv_inputs = 1
 max_mdense_tmp = 1
 
-def printVector(f, vector, name):
+def printVector(f, vector, name, dtype='float'):
     v = np.reshape(vector, (-1));
     #print('static const float ', name, '[', len(v), '] = \n', file=f)
-    f.write('static const float {}[{}] = {{\n   '.format(name, len(v)))
+    f.write('static const {} {}[{}] = {{\n   '.format(dtype, name, len(v)))
     for i in range(0, len(v)):
         f.write('{}'.format(v[i]))
         if (i!=len(v)-1):
@@ -59,11 +59,51 @@ def printVector(f, vector, name):
     f.write('\n};\n\n')
     return;
 
+def printSparseVector(f, A, name):
+    N = A.shape[0]
+    W = np.zeros((0,))
+    diag = np.concatenate([np.diag(A[:,:N]), np.diag(A[:,N:2*N]), np.diag(A[:,2*N:])])
+    A[:,:N] = A[:,:N] - np.diag(np.diag(A[:,:N]))
+    A[:,N:2*N] = A[:,N:2*N] - np.diag(np.diag(A[:,N:2*N]))
+    A[:,2*N:] = A[:,2*N:] - np.diag(np.diag(A[:,2*N:]))
+    printVector(f, diag, name + '_diag')
+    for i in range(3*N//16):
+        for j in range(N):
+            W = np.concatenate([W, A[j, i*16:(i+1)*16]])
+    printVector(f, W, name)
+    idx = np.tile(np.concatenate([np.array([N]), np.arange(N)]), 3*N//16)
+    printVector(f, idx, name + '_idx', dtype='int')
+    return;
+
 def dump_layer_ignore(self, f, hf):
     print("ignoring layer " + self.name + " of type " + self.__class__.__name__)
     return False
 Layer.dump_layer = dump_layer_ignore
 
+def dump_sparse_gru(self, f, hf):
+    global max_rnn_neurons
+    name = 'sparse_' + self.name
+    print("printing layer " + name + " of type sparse " + self.__class__.__name__)
+    weights = self.get_weights()
+    printSparseVector(f, weights[1], name + '_recurrent_weights')
+    printVector(f, weights[-1], name + '_bias')
+    if hasattr(self, 'activation'):
+        activation = self.activation.__name__.upper()
+    else:
+        activation = 'TANH'
+    if hasattr(self, 'reset_after') and not self.reset_after:
+        reset_after = 0
+    else:
+        reset_after = 1
+    neurons = weights[0].shape[1]//3
+    max_rnn_neurons = max(max_rnn_neurons, neurons)
+    f.write('const SparseGRULayer {} = {{\n   {}_bias,\n   {}_recurrent_weights_diag,\n   {}_recurrent_weights,\n   {}_recurrent_weights_idx,\n   {}, ACTIVATION_{}, {}\n}};\n\n'
+            .format(name, name, name, name, name, weights[0].shape[1]//3, activation, reset_after))
+    hf.write('#define {}_OUT_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+    hf.write('#define {}_STATE_SIZE {}\n'.format(name.upper(), weights[0].shape[1]//3))
+    hf.write('extern const SparseGRULayer {};\n\n'.format(name));
+    return True
+
 def dump_gru_layer(self, f, hf):
     global max_rnn_neurons
     name = self.name
@@ -205,6 +245,8 @@ for i, layer in enumerate(model.layers):
     if layer.dump_layer(f, hf):
         layer_list.append(layer.name)
 
+dump_sparse_gru(model.get_layer('gru_a'), f, hf)
+
 hf.write('#define MAX_RNN_NEURONS {}\n\n'.format(max_rnn_neurons))
 hf.write('#define MAX_CONV_INPUTS {}\n\n'.format(max_conv_inputs))
 hf.write('#define MAX_MDENSE_TMP {}\n\n'.format(max_mdense_tmp))
diff --git a/dnn/lpcnet.c b/dnn/lpcnet.c
index 04722900..fa391fe3 100644
--- a/dnn/lpcnet.c
+++ b/dnn/lpcnet.c
@@ -122,7 +122,8 @@ void run_sample_network(NNetState *net, float *pdf, const float *condition, cons
    accum_embedding(&gru_a_embed_sig, gru_a_input, last_sig);
    accum_embedding(&gru_a_embed_pred, gru_a_input, pred);
    accum_embedding(&gru_a_embed_exc, gru_a_input, last_exc);
-   compute_gru3(&gru_a, net->gru_a_state, gru_a_input);
+   /*compute_gru3(&gru_a, net->gru_a_state, gru_a_input);*/
+   compute_sparse_gru(&sparse_gru_a, net->gru_a_state, gru_a_input);
    RNN_COPY(in_b, net->gru_a_state, GRU_A_STATE_SIZE);
    RNN_COPY(&in_b[GRU_A_STATE_SIZE], condition, FEATURE_DENSE2_OUT_SIZE);
    compute_gru2(&gru_b, net->gru_b_state, in_b);
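A note on the layout that printSparseVector emits: the diagonal of each of the three NxN recurrent blocks is split out into a separate `_diag` array, and the remaining weights are stored as 16x1 sub-columns, 16 output rows per block. For every block, the `_idx` array holds a column count followed by the source-column indices of the stored sub-columns. As dumped by this patch the count is always N and the indices are simply 0..N-1, so the format allows sparsity but nothing is pruned yet. The following is a minimal NumPy sketch, not part of the patch, that rebuilds the dense (3N, N) recurrent matrix from the dumped arrays so the layout can be sanity-checked; rebuild_recurrent and its argument names are illustrative only.

import numpy as np

def rebuild_recurrent(diag, w, idx, N):
    # Rebuild the dense (3N, N) recurrent matrix so that
    # out = W @ state matches what sparse_gemm_accum16 (nnet.c hunk
    # below) accumulates from the dumped arrays.
    W = np.zeros((3*N, N))
    pw = pi = 0
    for b in range(3*N//16):        # each block covers 16 output rows
        cols = idx[pi]; pi += 1     # number of stored 16x1 sub-columns
        for _ in range(cols):
            j = idx[pi]; pi += 1    # state index feeding this block
            W[16*b:16*b+16, j] = w[pw:pw+16]
            pw += 16
    for k in range(3):              # fold the split-out diagonals back in
        W[k*N:(k+1)*N, :] += np.diag(diag[k*N:(k+1)*N])
    return W
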
diff --git a/dnn/nnet.c b/dnn/nnet.c
index 3d3a3a0a..d6360876 100644
--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@@ -105,6 +105,38 @@ static void gemm_accum16(float *out, const float *weights, int rows, int cols, i
       _mm256_storeu_ps (&y[8], vy8);
    }
 }
+
+static void sparse_gemm_accum16(float *out, const float *weights, int rows, const int *idx, const float *x)
+{
+   int i, j;
+   for (i=0;i<rows;i+=16)
+   {
+      float * restrict y;
+      int cols;
+      __m256 vy0, vy8;
+      y = &out[i];
+      vy0 = _mm256_loadu_ps(&y[0]);
+      vy8 = _mm256_loadu_ps(&y[8]);
+      cols = *idx++;
+      for (j=0;j<cols;j++)
+      {
+         int id;
+         __m256 vxj;
+         __m256 vw;
+         id = *idx++;
+         vxj = _mm256_broadcast_ss(&x[id]);
+
+         vw = _mm256_loadu_ps(&weights[0]);
+         vy0 = _mm256_fmadd_ps(vw, vxj, vy0);
+
+         vw = _mm256_loadu_ps(&weights[8]);
+         vy8 = _mm256_fmadd_ps(vw, vxj, vy8);
+         weights += 16;
+      }
+      _mm256_storeu_ps (&y[0], vy0);
+      _mm256_storeu_ps (&y[8], vy8);
+   }
+}
+
+void compute_sparse_gru(const SparseGRULayer *gru, float *state, const float *input)
+{
+   int i, k;
+   int N;
+   float zrh[3*MAX_RNN_NEURONS];
+   float recur[3*MAX_RNN_NEURONS];
+   float *z;
+   float *r;
+   float *h;
+   N = gru->nb_neurons;
+   z = zrh;
+   r = &zrh[N];
+   h = &zrh[2*N];
+   celt_assert(gru->nb_neurons <= MAX_RNN_NEURONS);
+   celt_assert(input != state);
+   celt_assert(gru->reset_after);
+   RNN_COPY(zrh, input, 3*N);
+   for (i=0;i<3*N;i++)
+      recur[i] = gru->bias[3*N + i];
+   for (k=0;k<3;k++)
+   {
+      for (i=0;i<N;i++)
+         recur[k*N + i] += gru->diag_weights[k*N + i]*state[i];
+   }
+   sparse_gemm_accum16(recur, gru->recurrent_weights, 3*N, gru->idx, state);
+   for (i=0;i<2*N;i++)
+      zrh[i] += recur[i];
+   compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
+   for (i=0;i<N;i++)
+      h[i] += recur[2*N+i]*r[i];
+   compute_activation(h, h, N, gru->activation);
+   for (i=0;i<N;i++)
+      h[i] = z[i]*state[i] + (1-z[i])*h[i];
+   for (i=0;i<N;i++)
+      state[i] = h[i];
+}
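For reference, the following minimal NumPy sketch mirrors what compute_sparse_gru does, under these assumptions: gru_input already holds the input-kernel contribution plus the input bias (in run_sample_network that sum is built by the accum_embedding calls), W is the dense (3N, N) recurrent matrix from the sketch after the first hunk, and b_rec is the recurrent half of the dumped bias vector (bias[3N:6N]). The name sparse_gru_reference is illustrative, not part of the codebase.

import numpy as np

def sigmoid(x):
    return 1/(1 + np.exp(-x))

def sparse_gru_reference(state, gru_input, W, b_rec, activation=np.tanh):
    # Reset-after GRU, mirroring compute_sparse_gru: the recurrent
    # contribution is computed first, then gated by r before the
    # candidate activation.
    N = state.shape[0]
    recur = b_rec + W @ state        # diag terms + sparse gemm in the C code
    zrh = gru_input.copy()           # input part of z, r, h (size 3N)
    zrh[:2*N] += recur[:2*N]
    z = sigmoid(zrh[:N])             # update gate
    r = sigmoid(zrh[N:2*N])          # reset gate
    h = activation(zrh[2*N:] + r*recur[2*N:])
    return z*state + (1 - z)*h       # new state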