Adding RTCD for DNN code

Starting with compute_linear()
This commit is contained in:
Jean-Marc Valin 2023-11-13 18:26:31 -05:00
parent b0620c0bf9
commit 2e034f6f31
No known key found for this signature in database
GPG key ID: 531A52533318F00A
31 changed files with 539 additions and 165 deletions

View file

@ -50,18 +50,30 @@ if CPU_X86
if HAVE_RTCD if HAVE_RTCD
CELT_SOURCES += $(CELT_SOURCES_X86_RTCD) CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
SILK_SOURCES += $(SILK_SOURCES_X86_RTCD) SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
endif
endif endif
if HAVE_SSE if HAVE_SSE
CELT_SOURCES += $(CELT_SOURCES_SSE) CELT_SOURCES += $(CELT_SOURCES_SSE)
endif endif
if HAVE_SSE2 if HAVE_SSE2
CELT_SOURCES += $(CELT_SOURCES_SSE2) CELT_SOURCES += $(CELT_SOURCES_SSE2)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
endif
endif endif
if HAVE_SSE4_1 if HAVE_SSE4_1
CELT_SOURCES += $(CELT_SOURCES_SSE4_1) CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
endif
endif endif
if HAVE_AVX2 if HAVE_AVX2
CELT_SOURCES += $(CELT_SOURCES_AVX2) CELT_SOURCES += $(CELT_SOURCES_AVX2)
if ENABLE_DEEP_PLC
LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
endif
endif endif
endif endif
@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
endif endif
if HAVE_SSE2 if HAVE_SSE2
SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
$(DNN_SOURCES_SSE2:.c=.lo)
$(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) $(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
endif endif
if HAVE_SSE4_1 if HAVE_SSE4_1
SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
$(DNN_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_SSE4_1:.c=.lo) \ $(SILK_SOURCES_SSE4_1:.c=.lo) \
$(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
$(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) $(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
endif endif
if HAVE_AVX2 if HAVE_AVX2
AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
$(DNN_SOURCES_AVX2:.c=.lo)
$(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS) $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
endif endif

View file

@ -47,7 +47,7 @@
# endif # endif
# if defined(OPUS_X86_MAY_HAVE_AVX2) # if defined(OPUS_X86_MAY_HAVE_AVX2)
# define MAY_HAVE_AVX2(name) name ## _avx # define MAY_HAVE_AVX2(name) name ## _avx2
# else # else
# define MAY_HAVE_AVX2(name) name ## _c # define MAY_HAVE_AVX2(name) name ## _c
# endif # endif

View file

@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init)
*init = 1; *init = 1;
} }
void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents) void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
{ {
int i; int i;
RDOVAEDecState dec; RDOVAEDecState dec;
memset(&dec, 0, sizeof(dec)); memset(&dec, 0, sizeof(dec));
dred_rdovae_dec_init_states(&dec, model, state); dred_rdovae_dec_init_states(&dec, model, state, arch);
for (i = 0; i < 2*nb_latents; i += 2) for (i = 0; i < 2*nb_latents; i += 2)
{ {
dred_rdovae_decode_qframe( dred_rdovae_decode_qframe(
&dec, &dec,
model, model,
&features[2*i*DRED_NUM_FEATURES], &features[2*i*DRED_NUM_FEATURES],
&latents[(i/2)*DRED_LATENT_DIM]); &latents[(i/2)*DRED_LATENT_DIM],
arch);
} }
} }
void dred_rdovae_dec_init_states( void dred_rdovae_dec_init_states(
RDOVAEDecState *h, /* io: state buffer handle */ RDOVAEDecState *h, /* io: state buffer handle */
const RDOVAEDec *model, const RDOVAEDec *model,
const float *initial_state /* i: initial state */ const float *initial_state, /* i: initial state */
int arch
) )
{ {
float hidden[DEC_HIDDEN_INIT_OUT_SIZE]; float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE]; float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
int counter=0; int counter=0;
compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH); compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH); compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE); OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
counter += DEC_GRU1_STATE_SIZE; counter += DEC_GRU1_STATE_SIZE;
OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE); OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe(
RDOVAEDecState *dec_state, /* io: state buffer handle */ RDOVAEDecState *dec_state, /* io: state buffer handle */
const RDOVAEDec *model, const RDOVAEDec *model,
float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */ float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */
const float *input /* i: latent vector */ const float *input, /* i: latent vector */
int arch
) )
{ {
float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe(
int output_index = 0; int output_index = 0;
/* run encoder stack and concatenate output in buffer*/ /* run encoder stack and concatenate output in buffer*/
compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH); compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += DEC_DENSE1_OUT_SIZE; output_index += DEC_DENSE1_OUT_SIZE;
compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer); compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state); compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
output_index += DEC_GRU1_OUT_SIZE; output_index += DEC_GRU1_OUT_SIZE;
conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV1_OUT_SIZE; output_index += DEC_CONV1_OUT_SIZE;
compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer); compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state); compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
output_index += DEC_GRU2_OUT_SIZE; output_index += DEC_GRU2_OUT_SIZE;
conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV2_OUT_SIZE; output_index += DEC_CONV2_OUT_SIZE;
compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer); compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state); compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
output_index += DEC_GRU3_OUT_SIZE; output_index += DEC_GRU3_OUT_SIZE;
conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV3_OUT_SIZE; output_index += DEC_CONV3_OUT_SIZE;
compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer); compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state); compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
output_index += DEC_GRU4_OUT_SIZE; output_index += DEC_GRU4_OUT_SIZE;
conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV4_OUT_SIZE; output_index += DEC_CONV4_OUT_SIZE;
compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer); compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state); compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
output_index += DEC_GRU5_OUT_SIZE; output_index += DEC_GRU5_OUT_SIZE;
conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized); conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += DEC_CONV5_OUT_SIZE; output_index += DEC_CONV5_OUT_SIZE;
compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR); compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
} }

View file

@ -46,8 +46,8 @@ struct RDOVAEDecStruct {
float conv5_state[DEC_CONV5_STATE_SIZE]; float conv5_state[DEC_CONV5_STATE_SIZE];
}; };
void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state); void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z); void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents); void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);
#endif #endif

View file

@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe(
const RDOVAEEnc *model, const RDOVAEEnc *model,
float *latents, /* o: latent vector */ float *latents, /* o: latent vector */
float *initial_state, /* o: initial state */ float *initial_state, /* o: initial state */
const float *input /* i: double feature frame (concatenated) */ const float *input, /* i: double feature frame (concatenated) */
int arch
) )
{ {
float padded_latents[DRED_PADDED_LATENT_DIM]; float padded_latents[DRED_PADDED_LATENT_DIM];
@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe(
int output_index = 0; int output_index = 0;
/* run encoder stack and concatenate output in buffer*/ /* run encoder stack and concatenate output in buffer*/
compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH); compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
output_index += ENC_DENSE1_OUT_SIZE; output_index += ENC_DENSE1_OUT_SIZE;
compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer); compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
output_index += ENC_GRU1_OUT_SIZE; output_index += ENC_GRU1_OUT_SIZE;
conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized); conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH); compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
output_index += ENC_CONV1_OUT_SIZE; output_index += ENC_CONV1_OUT_SIZE;
compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer); compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
output_index += ENC_GRU2_OUT_SIZE; output_index += ENC_GRU2_OUT_SIZE;
conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV2_OUT_SIZE; output_index += ENC_CONV2_OUT_SIZE;
compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer); compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
output_index += ENC_GRU3_OUT_SIZE; output_index += ENC_GRU3_OUT_SIZE;
conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV3_OUT_SIZE; output_index += ENC_CONV3_OUT_SIZE;
compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer); compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
output_index += ENC_GRU4_OUT_SIZE; output_index += ENC_GRU4_OUT_SIZE;
conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV4_OUT_SIZE; output_index += ENC_CONV4_OUT_SIZE;
compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer); compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE); OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
output_index += ENC_GRU5_OUT_SIZE; output_index += ENC_GRU5_OUT_SIZE;
conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized); conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH); compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
output_index += ENC_CONV5_OUT_SIZE; output_index += ENC_CONV5_OUT_SIZE;
compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR); compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM); OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);
/* next, calculate initial state */ /* next, calculate initial state */
compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH); compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR); compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM); OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
} }

View file

@ -46,7 +46,7 @@ struct RDOVAEEncStruct {
float conv5_state[2*ENC_CONV5_STATE_SIZE]; float conv5_state[2*ENC_CONV5_STATE_SIZE];
}; };
void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input); void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);
#endif #endif

View file

@ -42,6 +42,7 @@
#include "lpcnet.h" #include "lpcnet.h"
#include "lpcnet_private.h" #include "lpcnet_private.h"
#include "os_support.h" #include "os_support.h"
#include "cpu_support.h"
static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) { static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@ -135,7 +136,9 @@ int main(int argc, char **argv) {
FILE *fnoise = NULL; FILE *fnoise = NULL;
float noise_gain = 0; float noise_gain = 0;
long noise_size=0; long noise_size=0;
int arch;
srand(getpid()); srand(getpid());
arch = opus_select_arch();
st = lpcnet_encoder_create(); st = lpcnet_encoder_create();
argv0=argv[0]; argv0=argv[0];
if (argc == 5 && strcmp(argv[1], "-btrain")==0) { if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@ -244,7 +247,7 @@ int main(int argc, char **argv) {
for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5; for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
/* PCM is delayed by 1/2 frame to make the features centered on the frames. */ /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]); for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
compute_frame_features(st, x); compute_frame_features(st, x, arch);
if (fpcm) { if (fpcm) {
compute_noise(noisebuf, noise_std); compute_noise(noisebuf, noise_std);

View file

@ -36,6 +36,7 @@
#include "pitch.h" #include "pitch.h"
#include "nnet.h" #include "nnet.h"
#include "lpcnet_private.h" #include "lpcnet_private.h"
#include "cpu_support.h"
#define FARGAN_FEATURES (NB_FEATURES) #define FARGAN_FEATURES (NB_FEATURES)
@ -52,9 +53,9 @@ static void compute_fargan_cond(FARGANState *st, float *cond, const float *featu
OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE); OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
OPUS_COPY(dense_in, features, NB_FEATURES); OPUS_COPY(dense_in, features, NB_FEATURES);
compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH); compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
} }
static void fargan_deemphasis(float *pcm, float *deemph_mem) { static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
celt_assert(st->cont_initialized); celt_assert(st->cont_initialized);
model = &st->model; model = &st->model;
compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR); compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
gain = exp(gain); gain = exp(gain);
gain_1 = 1.f/(1e-5f + gain); gain_1 = 1.f/(1e-5f + gain);
@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4); OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH); compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs); celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in); compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);
compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID); compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in); compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state); compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in); compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state); compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in); compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state); compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);
OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE); OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE); OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2]; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE); OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH); compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out); compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);
compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH); compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain; for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;
OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE); OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@ -174,13 +175,13 @@ void fargan_init(FARGANState *st)
{ {
int ret; int ret;
OPUS_CLEAR(st, 1); OPUS_CLEAR(st, 1);
st->arch = opus_select_arch();
#ifndef USE_WEIGHTS_FILE #ifndef USE_WEIGHTS_FILE
ret = init_fargan(&st->model, fargan_arrays); ret = init_fargan(&st->model, fargan_arrays);
#else #else
ret = 0; ret = 0;
#endif #endif
celt_assert(ret == 0); celt_assert(ret == 0);
/* FIXME: perform arch detection. */
} }
int fargan_load_model(FARGANState *st, const unsigned char *data, int len) { int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {

View file

@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf)
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success * @retval 0 Success
*/ */
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]); int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame. /** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p
* @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
* @retval 0 Success * @retval 0 Success
*/ */
int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]); int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);
/** Gets the size of an <code>LPCNetState</code> structure. /** Gets the size of an <code>LPCNetState</code> structure.
* @returns The size in bytes. * @returns The size in bytes.

View file

@ -37,6 +37,7 @@
#include "freq.h" #include "freq.h"
#include "os_support.h" #include "os_support.h"
#include "fargan.h" #include "fargan.h"
#include "cpu_support.h"
#ifdef USE_WEIGHTS_FILE #ifdef USE_WEIGHTS_FILE
# if __unix__ # if __unix__
@ -99,12 +100,14 @@ void usage(void) {
int main(int argc, char **argv) { int main(int argc, char **argv) {
int mode=0; int mode=0;
int arch;
FILE *fin, *fout; FILE *fin, *fout;
#ifdef USE_WEIGHTS_FILE #ifdef USE_WEIGHTS_FILE
int len; int len;
unsigned char *data; unsigned char *data;
const char *filename = "weights_blob.bin"; const char *filename = "weights_blob.bin";
#endif #endif
arch = opus_select_arch();
if (argc < 4) usage(); if (argc < 4) usage();
if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES; if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS; else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@ -137,7 +140,7 @@ int main(int argc, char **argv) {
size_t ret; size_t ret;
ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin); ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
if (feof(fin) || ret != LPCNET_FRAME_SIZE) break; if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
lpcnet_compute_single_frame_features(net, pcm, features); lpcnet_compute_single_frame_features(net, pcm, features, arch);
fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout); fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
} }
lpcnet_encoder_destroy(net); lpcnet_encoder_destroy(net);

View file

@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const
#define celt_log10(x) (0.3010299957f*celt_log2(x)) #define celt_log10(x) (0.3010299957f*celt_log2(x))
void compute_frame_features(LPCNetEncState *st, const float *in) { void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
float aligned_in[FRAME_SIZE]; float aligned_in[FRAME_SIZE];
int i; int i;
float Ly[NB_BANDS]; float Ly[NB_BANDS];
@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER); OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE); OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER); OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch); celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
for (i=0;i<FRAME_SIZE;i++) { for (i=0;i<FRAME_SIZE;i++) {
st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt; st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i]; st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
{ {
double ener1; double ener1;
float *buf = st->exc_buf; float *buf = st->exc_buf;
celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch); celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE); ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1); ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
/*printf("%f\n", st->frame_weight[sub]);*/ /*printf("%f\n", st->frame_weight[sub]);*/
@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
} }
/*printf("\n");*/ /*printf("\n");*/
} }
st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features); st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
} }
void process_single_frame(LPCNetEncState *st, FILE *ffeat) { void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
} }
} }
static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) { static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE); preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
compute_frame_features(st, x); compute_frame_features(st, x, arch);
process_single_frame(st, NULL); process_single_frame(st, NULL);
OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES); OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
return 0; return 0;
} }
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) { int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i; int i;
float x[FRAME_SIZE]; float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i]; for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
lpcnet_compute_single_frame_features_impl(st, x, features); lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0; return 0;
} }
int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) { int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
int i; int i;
float x[FRAME_SIZE]; float x[FRAME_SIZE];
for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i]; for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
lpcnet_compute_single_frame_features_impl(st, x, features); lpcnet_compute_single_frame_features_impl(st, x, features, arch);
return 0; return 0;
} }

View file

@ -33,6 +33,7 @@
#include "plc_data.h" #include "plc_data.h"
#include "os_support.h" #include "os_support.h"
#include "common.h" #include "common.h"
#include "cpu_support.h"
#ifndef M_PI #ifndef M_PI
#define M_PI 3.141592653 #define M_PI 3.141592653
@ -54,6 +55,7 @@ void lpcnet_plc_reset(LPCNetPLCState *st) {
int lpcnet_plc_init(LPCNetPLCState *st) { int lpcnet_plc_init(LPCNetPLCState *st) {
int ret; int ret;
st->arch = opus_select_arch();
fargan_init(&st->fargan); fargan_init(&st->fargan);
lpcnet_encoder_init(&st->enc); lpcnet_encoder_init(&st->enc);
st->analysis_pos = PLC_BUF_SIZE; st->analysis_pos = PLC_BUF_SIZE;
@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
float dense_out[PLC_DENSE1_OUT_SIZE]; float dense_out[PLC_DENSE1_OUT_SIZE];
PLCNetState *net = &st->plc_net; PLCNetState *net = &st->plc_net;
celt_assert(st->loaded); celt_assert(st->loaded);
_lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in); _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out); compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state); compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
_lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state); _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
} }
static int get_fec_or_pred(LPCNetPLCState *st, float *out) { static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) {
float plc_features[2*NB_BANDS+NB_FEATURES+1]; float plc_features[2*NB_BANDS+NB_FEATURES+1];
for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i]; for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
burg_cepstral_analysis(plc_features, x); burg_cepstral_analysis(plc_features, x);
lpcnet_compute_single_frame_features_float(&st->enc, x, st->features); lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
if ((st->analysis_gap && count > 0) || count > 1) { if ((st->analysis_gap && count > 0) || count > 1) {
queue_features(st, st->features); queue_features(st, st->features);
OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES); OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);

View file

@ -24,7 +24,6 @@
struct LPCNetEncState{ struct LPCNetEncState{
PitchDNNState pitchdnn; PitchDNNState pitchdnn;
int arch;
float analysis_mem[OVERLAP_SIZE]; float analysis_mem[OVERLAP_SIZE];
float mem_preemph; float mem_preemph;
kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ]; kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@ -67,7 +66,7 @@ struct LPCNetPLCState {
void preemphasis(float *y, float *mem, const float *x, float coef, int N); void preemphasis(float *y, float *mem, const float *x, float coef, int N);
void compute_frame_features(LPCNetEncState *st, const float *in); void compute_frame_features(LPCNetEncState *st, const float *in, int arch);
void lpcnet_reset_signal(LPCNetState *lpcnet); void lpcnet_reset_signal(LPCNetState *lpcnet);
void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features); void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N,
void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload); void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N); void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
void process_single_frame(LPCNetEncState *st, FILE *ffeat); void process_single_frame(LPCNetEncState *st, FILE *ffeat);
int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
void process_single_frame(LPCNetEncState *st, FILE *ffeat); void process_single_frame(LPCNetEncState *st, FILE *ffeat);

View file

@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x)
return x < 0 ? 0 : x; return x < 0 ? 0 : x;
} }
static void compute_linear(const LinearLayer *linear, float *out, const float *in) void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
{ {
int i, M, N; compute_linear(layer, output, input, arch);
const float *bias;
celt_assert(in != out);
bias = linear->bias;
M = linear->nb_inputs;
N = linear->nb_outputs;
if (linear->float_weights != NULL) {
if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
else sgemv(out, linear->float_weights, N, M, N, in);
} else if (linear->weights != NULL) {
if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
/* Only use SU biases on for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
bias = linear->subias;
#endif
}
else OPUS_CLEAR(out, N);
if (bias != NULL) {
for (i=0;i<N;i++) out[i] += bias[i];
}
if (linear->diag) {
/* Diag is only used for GRU recurrent weights. */
celt_assert(3*M == N);
for (i=0;i<M;i++) {
out[i] += linear->diag[i]*in[i];
out[i+M] += linear->diag[i+M]*in[i];
out[i+2*M] += linear->diag[i+2*M]*in[i];
}
}
}
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
{
compute_linear(layer, output, input);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
} }
#define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS) #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)
void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in) void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
{ {
int i; int i;
int N; int N;
@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
h = &zrh[2*N]; h = &zrh[2*N];
celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL); celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
celt_assert(in != state); celt_assert(in != state);
compute_linear(input_weights, zrh, in); compute_linear(input_weights, zrh, in, arch);
compute_linear(recurrent_weights, recur, state); compute_linear(recurrent_weights, recur, state, arch);
for (i=0;i<2*N;i++) for (i=0;i<2*N;i++)
zrh[i] += recur[i]; zrh[i] += recur[i];
compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID); compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
state[i] = h[i]; state[i] = h[i];
} }
void compute_glu(const LinearLayer *layer, float *output, const float *input) void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
{ {
int i; int i;
float act2[MAX_INPUTS]; float act2[MAX_INPUTS];
celt_assert(layer->nb_inputs == layer->nb_outputs); celt_assert(layer->nb_inputs == layer->nb_outputs);
compute_linear(layer, act2, input); compute_linear(layer, act2, input, arch);
compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID); compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
if (input == output) { if (input == output) {
/* Give a vectorization hint to the compiler for the in-place case. */ /* Give a vectorization hint to the compiler for the in-place case. */
@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation
} }
} }
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input) void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
{ {
LinearLayer matrix; LinearLayer matrix;
celt_assert(input != output); celt_assert(input != output);
@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
matrix.nb_inputs = layer->nb_inputs; matrix.nb_inputs = layer->nb_inputs;
matrix.nb_outputs = layer->nb_neurons; matrix.nb_outputs = layer->nb_neurons;
matrix.scale = NULL; matrix.scale = NULL;
compute_linear(&matrix, output, input); compute_linear(&matrix, output, input, arch);
compute_activation(output, output, layer->nb_neurons, layer->activation); compute_activation(output, output, layer->nb_neurons, layer->activation);
} }
@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
#endif #endif
#define MAX_IDX_SIZE 8192 #define MAX_IDX_SIZE 8192
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input) void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
{ {
LinearLayer in_matrix, rec_matrix; LinearLayer in_matrix, rec_matrix;
int i, M, N; int i, M, N;
@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat
rec_matrix.float_weights = NULL; rec_matrix.float_weights = NULL;
#endif #endif
rec_matrix.weights_idx = NULL; rec_matrix.weights_idx = NULL;
compute_generic_gru(&in_matrix, &rec_matrix, state, input); compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
} }
#define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS #define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS
void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation) void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
{ {
float tmp[MAX_CONV_INPUTS_ALL]; float tmp[MAX_CONV_INPUTS_ALL];
celt_assert(input != output); celt_assert(input != output);
celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL); celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp); compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
} }
void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation) void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
{ {
float tmp[MAX_CONV_INPUTS_ALL]; float tmp[MAX_CONV_INPUTS_ALL];
int ksize = layer->nb_inputs/input_size; int ksize = layer->nb_inputs/input_size;
@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size); else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
compute_linear(layer, output, tmp); compute_linear(layer, output, tmp, arch);
compute_activation(output, output, layer->nb_outputs, activation); compute_activation(output, output, layer->nb_outputs, activation);
if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
else { else {

View file

@ -126,18 +126,18 @@ typedef struct {
int dim; int dim;
} EmbeddingLayer; } EmbeddingLayer;
void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation); void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in); void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation); void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation); void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
void compute_glu(const LinearLayer *layer, float *output, const float *input); void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation); void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
void compute_activation(float *output, const float *input, int N, int activation); void compute_activation(float *output, const float *input, int N, int activation);
void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input); void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);
void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input); void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);
@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);
/* Portable C implementation of the linear (matrix-vector) kernel; the
   arch-specific headers below may override the compute_linear() macro
   with an optimized or run-time-dispatched version. */
void compute_linear_c(const LinearLayer *linear, float *out, const float *in);

/* x86 overrides: may define OVERRIDE_COMPUTE_LINEAR and a faster compute_linear(). */
#if defined(OPUS_X86_MAY_HAVE_SSE2)
#include "x86/dnn_x86.h"
#endif

/* Fallback: no override was installed, so route every arch to the C kernel.
   The (void)(arch) silences unused-argument warnings while keeping the
   4-argument calling convention uniform across builds. */
#ifndef OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
#endif

/* Build-time hint: on x86-64 compiled without SSE4.1/AVX2 support, warn that
   the DNN code will run noticeably slower than it could. */
#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)
#if defined(_MSC_VER)
#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
#else
#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
#endif
#endif
#endif /* NNET_H_ */ #endif /* NNET_H_ */

76
dnn/nnet_arch.h Normal file
View file

@ -0,0 +1,76 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef NNET_ARCH_H
#define NNET_ARCH_H
#include "nnet.h"
#include "arch.h"
#include "os_support.h"
#include "vec.h"
/* Two-level concatenation so that RTCD_ARCH is macro-expanded before the
   token paste; RTCD_SUF(compute_linear_) yields e.g. compute_linear_sse2
   when this header is included with RTCD_ARCH defined to sse2. */
#define CAT_SUFFIX2(a,b) a ## b
#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b)
#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH)
void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
{
int i, M, N;
const float *bias;
celt_assert(in != out);
bias = linear->bias;
M = linear->nb_inputs;
N = linear->nb_outputs;
if (linear->float_weights != NULL) {
if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
else sgemv(out, linear->float_weights, N, M, N, in);
} else if (linear->weights != NULL) {
if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
/* Only use SU biases on for integer matrices on SU archs. */
#ifdef USE_SU_BIAS
bias = linear->subias;
#endif
}
else OPUS_CLEAR(out, N);
if (bias != NULL) {
for (i=0;i<N;i++) out[i] += bias[i];
}
if (linear->diag) {
/* Diag is only used for GRU recurrent weights. */
celt_assert(3*M == N);
for (i=0;i<M;i++) {
out[i] += linear->diag[i]*in[i];
out[i+M] += linear->diag[i+M]*in[i];
out[i+2*M] += linear->diag[i+2*M]*in[i];
}
}
}
#endif

35
dnn/nnet_default.c Normal file
View file

@ -0,0 +1,35 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* Instantiate the generic C version of the DNN kernels from nnet_arch.h,
   producing compute_linear_c (the portable RTCD fallback). */
#define RTCD_ARCH c

#include "nnet_arch.h"

View file

@ -12,7 +12,8 @@
float compute_pitchdnn( float compute_pitchdnn(
PitchDNNState *st, PitchDNNState *st,
const float *if_features, const float *if_features,
const float *xcorr_features const float *xcorr_features,
int arch
) )
{ {
float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE]; float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@ -28,16 +29,16 @@ float compute_pitchdnn(
float count=0; float count=0;
PitchDNN *model = &st->model; PitchDNN *model = &st->model;
/* IF */ /* IF */
compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH); compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH); compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
/* xcorr*/ /* xcorr*/
OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES); OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH); compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH); compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);
compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH); compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out); compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR); compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
for (i=0;i<180;i++) { for (i=0;i<180;i++) {
if (output[i] > maxval) { if (output[i] > maxval) {
pos = i; pos = i;
@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st)
ret = 0; ret = 0;
#endif #endif
celt_assert(ret == 0); celt_assert(ret == 0);
/* FIXME: perform arch detection. */
} }
int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) { int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {

View file

@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len);
float compute_pitchdnn( float compute_pitchdnn(
PitchDNNState *st, PitchDNNState *st,
const float *if_features, const float *if_features,
const float *xcorr_features const float *xcorr_features,
int arch
); );
#endif #endif

View file

@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a,
return res; return res;
} }
#if defined(_MSC_VER)
#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
#else
#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
#endif
#else #else

78
dnn/x86/dnn_x86.h Normal file
View file

@ -0,0 +1,78 @@
/* Copyright (c) 2011-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifndef DNN_X86_H
#define DNN_X86_H

#include "cpu_support.h"
#include "opus_types.h"

/* Prototypes for the arch-specific compute_linear kernels, each compiled
   from nnet_arch.h with the matching RTCD_ARCH suffix. */
#if defined(OPUS_X86_MAY_HAVE_SSE2)
void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
#endif

#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
#endif

#if defined(OPUS_X86_MAY_HAVE_AVX2)
void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
#endif

/* Dispatch policy, from strongest compile-time guarantee down to run-time
   detection:
   - PRESUME_<isa>: the build targets that ISA unconditionally, so call the
     matching kernel directly and ignore the runtime arch value.
   - Otherwise, with RTCD enabled and at least one optional ISA compiled in,
     dispatch through a per-arch function-pointer table indexed by
     (arch & OPUS_ARCHMASK). */
#if defined(OPUS_X86_PRESUME_AVX2)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))

#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))

#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1)

#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))

#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2))

/* Run-time dispatch table (defined in the x86 RTCD tables). */
extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
                    const LinearLayer *linear,
                    float *out,
                    const float *in
                    );
#define OVERRIDE_COMPUTE_LINEAR
#define compute_linear(linear, out, in, arch) \
    ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))

#endif

#endif /* DNN_X86_H */

38
dnn/x86/nnet_avx2.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif

/* This translation unit must be built with AVX2 codegen enabled (the build
   system adds the flags); fail loudly if it is not. */
#ifndef __AVX2__
#error nnet_avx2.c is being compiled without AVX2 enabled
#endif

/* Instantiate the DNN kernels from nnet_arch.h as compute_linear_avx2. */
#define RTCD_ARCH avx2

#include "nnet_arch.h"

38
dnn/x86/nnet_sse2.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
/* Sanity check: this translation unit must be compiled with SSE2 code
   generation enabled (the build system is expected to add the SSE2
   compiler flags for this file); failing here points at a build-system
   misconfiguration rather than a source bug. */
#ifndef __SSE2__
#error nnet_sse2.c is being compiled without SSE2 enabled
#endif
/* RTCD_ARCH selects the per-architecture suffix used by nnet_arch.h when
   it instantiates the nnet kernels, producing the sse2 variants here. */
#define RTCD_ARCH sse2
#include "nnet_arch.h"

38
dnn/x86/nnet_sse4_1.c Normal file
View file

@ -0,0 +1,38 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
/* Sanity check: this translation unit must be compiled with SSE4.1 code
   generation enabled (the build system is expected to add the SSE4.1
   compiler flags for this file); failing here points at a build-system
   misconfiguration rather than a source bug. */
#ifndef __SSE4_1__
#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
#endif
/* RTCD_ARCH selects the per-architecture suffix used by nnet_arch.h when
   it instantiates the nnet kernels, producing the sse4_1 variants here. */
#define RTCD_ARCH sse4_1
#include "nnet_arch.h"

54
dnn/x86/x86_dnn_map.c Normal file
View file

@ -0,0 +1,54 @@
/* Copyright (c) 2018-2019 Mozilla
2023 Amazon */
/*
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions
are met:
- Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR
CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
#ifdef HAVE_CONFIG_H
#include "config.h"
#endif
#include "x86/x86cpu.h"
#include "nnet.h"
#if defined(OPUS_HAVE_RTCD)
/* Only emit the table when dispatch is actually needed: at least one SIMD
   level may be present at run time and AVX2 is not presumed at build time
   (if AVX2 is presumed, the header binds compute_linear directly). */
#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
/* Run-time dispatch table for compute_linear(), indexed by
   (arch & OPUS_ARCHMASK).  Entry order is a contract: it must match the
   x86 arch numbering used by the CPU detection code — presumably
   0 = non-SSE, 1 = SSE, 2 = SSE2, 3 = SSE4.1, 4 = AVX2, per the comments
   below; confirm against x86/x86cpu.h.  The MAY_HAVE_*() macros resolve
   to the SIMD implementation when that ISA may be available, or to the C
   fallback otherwise. */
void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
const LinearLayer *linear,
float *out,
const float *in
) = {
compute_linear_c, /* non-sse */
compute_linear_c, /* sse */
MAY_HAVE_SSE2(compute_linear),
MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */
MAY_HAVE_AVX2(compute_linear) /* avx */
};
#endif
#endif

View file

@ -12,7 +12,9 @@ dnn/vec.h \
dnn/vec_avx.h \ dnn/vec_avx.h \
dnn/vec_neon.h \ dnn/vec_neon.h \
dnn/pitchdnn.h \ dnn/pitchdnn.h \
dnn/pitchdnn_data.h dnn/pitchdnn_data.h \
dnn/x86/dnn_x86.h \
dnn/nnet_arch.h
DRED_HEAD = \ DRED_HEAD = \
silk/dred_coding.h \ silk/dred_coding.h \

View file

@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \
dnn/lpcnet_plc.c \ dnn/lpcnet_plc.c \
dnn/lpcnet_tables.c \ dnn/lpcnet_tables.c \
dnn/nnet.c \ dnn/nnet.c \
dnn/nnet_default.c \
dnn/plc_data.c \ dnn/plc_data.c \
dnn/parse_lpcnet_weights.c \ dnn/parse_lpcnet_weights.c \
dnn/pitchdnn.c \ dnn/pitchdnn.c \
@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \
silk/dred_encoder.c \ silk/dred_encoder.c \
silk/dred_coding.c \ silk/dred_coding.c \
silk/dred_decoder.c silk/dred_decoder.c
DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c

View file

@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels)
dred_encoder_reset(enc); dred_encoder_reset(enc);
} }
static void dred_process_frame(DREDEnc *enc) static void dred_process_frame(DREDEnc *enc, int arch)
{ {
float feature_buffer[2 * 36]; float feature_buffer[2 * 36];
float input_buffer[2*DRED_NUM_FEATURES] = {0}; float input_buffer[2*DRED_NUM_FEATURES] = {0};
@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc)
OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM); OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);
/* calculate LPCNet features */ /* calculate LPCNet features */
lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer); lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36); lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);
/* prepare input buffer (discard LPC coefficients) */ /* prepare input buffer (discard LPC coefficients) */
OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES); OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES); OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);
/* run RDOVAE encoder */ /* run RDOVAE encoder */
dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer); dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES); enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
} }
@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float
} }
} }
void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay) void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
{ {
int curr_offset16k; int curr_offset16k;
int frame_size16k = frame_size * 16000 / enc->Fs; int frame_size16k = frame_size * 16000 / enc->Fs;
@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE) if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
{ {
curr_offset16k += 320; curr_offset16k += 320;
dred_process_frame(enc); dred_process_frame(enc, arch);
enc->input_buffer_fill -= 2*DRED_FRAME_SIZE; enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill); OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
/* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */ /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */

View file

@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc);
void dred_deinit_encoder(DREDEnc *enc); void dred_deinit_encoder(DREDEnc *enc);
void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay); void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);
int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes); int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);

View file

@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *
OPUS_COPY(dst, src, 1); OPUS_COPY(dst, src, 1);
if (dst->process_stage == 2) if (dst->process_stage == 2)
return OPUS_OK; return OPUS_OK;
DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents); DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
dst->process_stage = 2; dst->process_stage = 2;
return OPUS_OK; return OPUS_OK;
#else #else

View file

@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
#ifdef ENABLE_DRED #ifdef ENABLE_DRED
if ( st->dred_duration > 0 && st->dred_encoder.loaded ) { if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
/* DRED Encoder */ /* DRED Encoder */
dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer ); dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
} else { } else {
st->dred_encoder.latents_buffer_fill = 0; st->dred_encoder.latents_buffer_fill = 0;
} }