Adding run-time CPU detection (RTCD) for DNN code

Starting with compute_linear()
2023-11-13 18:26:31 -05:00 · 2023-11-13 18:26:31 -05:00 · 2e034f6f31
commit 2e034f6f31
parent b0620c0bf9
31 changed files with 539 additions and 165 deletions
--- a/Makefile.am
+++ b/Makefile.am
@ -50,18 +50,30 @@ if CPU_X86
 if HAVE_RTCD
 CELT_SOURCES += $(CELT_SOURCES_X86_RTCD)
 SILK_SOURCES += $(SILK_SOURCES_X86_RTCD)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD)
+endif
 endif
 if HAVE_SSE
 CELT_SOURCES += $(CELT_SOURCES_SSE)
 endif
 if HAVE_SSE2
 CELT_SOURCES += $(CELT_SOURCES_SSE2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE2)
+endif
 endif
 if HAVE_SSE4_1
 CELT_SOURCES += $(CELT_SOURCES_SSE4_1)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1)
+endif
 endif
 if HAVE_AVX2
 CELT_SOURCES += $(CELT_SOURCES_AVX2)
+if ENABLE_DEEP_PLC
+LPCNET_SOURCES += $(DNN_SOURCES_AVX2)
+endif
 endif
 endif

@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS)
 endif

 if HAVE_SSE2
-SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo)
+SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \
+           $(DNN_SOURCES_SSE2:.c=.lo)
 $(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS)
 endif

 if HAVE_SSE4_1
 SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \
+             $(DNN_SOURCES_SSE4_1:.c=.lo) \
             $(SILK_SOURCES_SSE4_1:.c=.lo) \
             $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo)
 $(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS)
 endif

 if HAVE_AVX2
-AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo)
+AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \
+           $(DNN_SOURCES_AVX2:.c=.lo)
 $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS)
 endif

--- a/celt/x86/x86cpu.h
+++ b/celt/x86/x86cpu.h
@ -47,7 +47,7 @@
 # endif

 # if defined(OPUS_X86_MAY_HAVE_AVX2)
-#  define MAY_HAVE_AVX2(name) name ## _avx
+#  define MAY_HAVE_AVX2(name) name ## _avx2
 # else
 #  define MAY_HAVE_AVX2(name) name ## _c
 # endif
--- a/dnn/dred_rdovae_dec.c
+++ b/dnn/dred_rdovae_dec.c
@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init)
    *init = 1;
 }

-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents)
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch)
 {
    int i;
    RDOVAEDecState dec;
    memset(&dec, 0, sizeof(dec));
-    dred_rdovae_dec_init_states(&dec, model, state);
+    dred_rdovae_dec_init_states(&dec, model, state, arch);
    for (i = 0; i < 2*nb_latents; i += 2)
    {
        dred_rdovae_decode_qframe(
            &dec,
            model,
            &features[2*i*DRED_NUM_FEATURES],
-            &latents[(i/2)*DRED_LATENT_DIM]);
+            &latents[(i/2)*DRED_LATENT_DIM],
+            arch);
    }
 }

 void dred_rdovae_dec_init_states(
    RDOVAEDecState *h,            /* io: state buffer handle */
    const RDOVAEDec *model,
-    const float *initial_state  /* i: initial state */
+    const float *initial_state,  /* i: initial state */
+    int arch
    )
 {
    float hidden[DEC_HIDDEN_INIT_OUT_SIZE];
    float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE];
    int counter=0;
-    compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH);
-    compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH);
+    compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch);
+    compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch);
    OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE);
    counter += DEC_GRU1_STATE_SIZE;
    OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE);
@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe(
    RDOVAEDecState *dec_state,       /* io: state buffer handle */
    const RDOVAEDec *model,
    float *qframe,              /* o: quadruple feature frame (four concatenated frames in reverse order) */
-    const float *input          /* i: latent vector */
+    const float *input,          /* i: latent vector */
+    int arch
    )
 {
    float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + DEC_GRU5_OUT_SIZE
@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe(
    int output_index = 0;

    /* run decoder stack and concatenate output in buffer */
-    compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+    compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
    output_index += DEC_DENSE1_OUT_SIZE;

-    compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer);
-    compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state);
+    compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch);
+    compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch);
    output_index += DEC_GRU1_OUT_SIZE;
    conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += DEC_CONV1_OUT_SIZE;

-    compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer);
-    compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state);
+    compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch);
+    compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch);
    output_index += DEC_GRU2_OUT_SIZE;
    conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += DEC_CONV2_OUT_SIZE;

-    compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer);
-    compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state);
+    compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch);
+    compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch);
    output_index += DEC_GRU3_OUT_SIZE;
    conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += DEC_CONV3_OUT_SIZE;

-    compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer);
-    compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state);
+    compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch);
+    compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch);
    output_index += DEC_GRU4_OUT_SIZE;
    conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += DEC_CONV4_OUT_SIZE;

-    compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer);
-    compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state);
+    compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch);
+    compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch);
    output_index += DEC_GRU5_OUT_SIZE;
    conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized);
-    compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += DEC_CONV5_OUT_SIZE;

-    compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch);
 }
--- a/dnn/dred_rdovae_dec.h
+++ b/dnn/dred_rdovae_dec.h
@ -46,8 +46,8 @@ struct RDOVAEDecStruct {
  float conv5_state[DEC_CONV5_STATE_SIZE];
 };

-void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state);
-void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z);
-void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents);
+void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch);
+void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch);
+void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch);

 #endif
--- a/dnn/dred_rdovae_enc.c
+++ b/dnn/dred_rdovae_enc.c
@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe(
    const RDOVAEEnc *model,
    float *latents,                 /* o: latent vector */
    float *initial_state,           /* o: initial state */
-    const float *input              /* i: double feature frame (concatenated) */
+    const float *input,              /* i: double feature frame (concatenated) */
+    int arch
    )
 {
    float padded_latents[DRED_PADDED_LATENT_DIM];
@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe(
    int output_index = 0;

    /* run encoder stack and concatenate output in buffer*/
-    compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH);
+    compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch);
    output_index += ENC_DENSE1_OUT_SIZE;

-    compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer);
+    compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch);
    OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE);
    output_index += ENC_GRU1_OUT_SIZE;
    conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized);
-    compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH);
+    compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch);
    output_index += ENC_CONV1_OUT_SIZE;

-    compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer);
+    compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch);
    OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE);
    output_index += ENC_GRU2_OUT_SIZE;
    conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
    output_index += ENC_CONV2_OUT_SIZE;

-    compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer);
+    compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch);
    OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE);
    output_index += ENC_GRU3_OUT_SIZE;
    conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
    output_index += ENC_CONV3_OUT_SIZE;

-    compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer);
+    compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch);
    OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE);
    output_index += ENC_GRU4_OUT_SIZE;
    conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
    output_index += ENC_CONV4_OUT_SIZE;

-    compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer);
+    compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch);
    OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE);
    output_index += ENC_GRU5_OUT_SIZE;
    conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized);
-    compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH);
+    compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch);
    output_index += ENC_CONV5_OUT_SIZE;

-    compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch);
    OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM);

    /* next, calculate initial state */
-    compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH);
-    compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR);
+    compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch);
+    compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch);
    OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM);
 }
--- a/dnn/dred_rdovae_enc.h
+++ b/dnn/dred_rdovae_enc.h
@ -46,7 +46,7 @@ struct RDOVAEEncStruct {
    float conv5_state[2*ENC_CONV5_STATE_SIZE];
 };

-void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input);
+void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch);


 #endif
--- a/dnn/dump_data.c
+++ b/dnn/dump_data.c
@ -42,6 +42,7 @@
 #include "lpcnet.h"
 #include "lpcnet_private.h"
 #include "os_support.h"
+#include "cpu_support.h"


 static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) {
@ -135,7 +136,9 @@ int main(int argc, char **argv) {
  FILE *fnoise = NULL;
  float noise_gain = 0;
  long noise_size=0;
+  int arch;
  srand(getpid());
+  arch = opus_select_arch();
  st = lpcnet_encoder_create();
  argv0=argv[0];
  if (argc == 5 && strcmp(argv[1], "-btrain")==0) {
@ -244,7 +247,7 @@ int main(int argc, char **argv) {
    for (i=0;i<FRAME_SIZE;i++) x[i] += rand()/(float)RAND_MAX - .5;
    /* PCM is delayed by 1/2 frame to make the features centered on the frames. */
    for (i=0;i<FRAME_SIZE-TRAINING_OFFSET;i++) pcm[i+TRAINING_OFFSET] = float2short(x[i]);
-    compute_frame_features(st, x);
+    compute_frame_features(st, x, arch);

    if (fpcm) {
        compute_noise(noisebuf, noise_std);
--- a/dnn/fargan.c
+++ b/dnn/fargan.c
@ -36,6 +36,7 @@
 #include "pitch.h"
 #include "nnet.h"
 #include "lpcnet_private.h"
+#include "cpu_support.h"

 #define FARGAN_FEATURES (NB_FEATURES)

@ -52,9 +53,9 @@ static void compute_fargan_cond(FARGANState *st, float *cond, const float *featu
  OPUS_COPY(&dense_in[NB_FEATURES], &model->cond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE);
  OPUS_COPY(dense_in, features, NB_FEATURES);

-  compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH);
-  compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH);
-  compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH);
+  compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch);
+  compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch);
+  compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch);
 }

 static void fargan_deemphasis(float *pcm, float *deemph_mem) {
@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
  celt_assert(st->cont_initialized);
  model = &st->model;

-  compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR);
+  compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch);
  gain = exp(gain);
  gain_1 = 1.f/(1e-5f + gain);

@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
  OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4);
  OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE);

-  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH);
+  compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch);
  celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs);
-  compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in);
+  compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch);

-  compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID);
+  compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch);

  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+i] = pitch_gate[0]*pred[i+2];
  OPUS_COPY(&gru1_in[SIG_NET_FWC0_GLU_GATE_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in);
-  compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state);
+  compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch);
+  compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch);

  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru2_in[SIG_NET_GRU1_OUT_SIZE+i] = pitch_gate[1]*pred[i+2];
  OPUS_COPY(&gru2_in[SIG_NET_GRU1_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in);
-  compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state);
+  compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch);
+  compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch);

  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) gru3_in[SIG_NET_GRU2_OUT_SIZE+i] = pitch_gate[2]*pred[i+2];
  OPUS_COPY(&gru3_in[SIG_NET_GRU2_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);
-  compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in);
-  compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state);
+  compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch);
+  compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch);

  OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE);
  OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE);
@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond,
  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+i] = pitch_gate[3]*pred[i+2];
  OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE+SIG_NET_GRU3_OUT_SIZE+SIG_NET_FWC0_CONV_OUT_SIZE+FARGAN_SUBFRAME_SIZE], prev, FARGAN_SUBFRAME_SIZE);

-  compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH);
-  compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out);
+  compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch);
+  compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch);

-  compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH);
+  compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch);
  for (i=0;i<FARGAN_SUBFRAME_SIZE;i++) pcm[i] *= gain;

  OPUS_MOVE(st->pitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE);
@ -174,13 +175,13 @@ void fargan_init(FARGANState *st)
 {
  int ret;
  OPUS_CLEAR(st, 1);
+  st->arch = opus_select_arch();
 #ifndef USE_WEIGHTS_FILE
  ret = init_fargan(&st->model, fargan_arrays);
 #else
  ret = 0;
 #endif
  celt_assert(ret == 0);
-  /* FIXME: perform arch detection. */
 }

 int fargan_load_model(FARGANState *st, const unsigned char *data, int len) {
--- a/dnn/lpcnet.h
+++ b/dnn/lpcnet.h
@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf)
  * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
  * @retval 0 Success
  */
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch);


 /** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame.
@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p
  * @param [out] features <tt>float[NB_TOTAL_FEATURES]</tt>: Four feature vectors
  * @retval 0 Success
  */
-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]);
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch);

 /** Gets the size of an <code>LPCNetState</code> structure.
  * @returns The size in bytes.
--- a/dnn/lpcnet_demo.c
+++ b/dnn/lpcnet_demo.c
@ -37,6 +37,7 @@
 #include "freq.h"
 #include "os_support.h"
 #include "fargan.h"
+#include "cpu_support.h"

 #ifdef USE_WEIGHTS_FILE
 # if __unix__
@ -99,12 +100,14 @@ void usage(void) {

 int main(int argc, char **argv) {
    int mode=0;
+    int arch;
    FILE *fin, *fout;
 #ifdef USE_WEIGHTS_FILE
    int len;
    unsigned char *data;
    const char *filename = "weights_blob.bin";
 #endif
+    arch = opus_select_arch();
    if (argc < 4) usage();
    if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES;
    else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS;
@ -137,7 +140,7 @@ int main(int argc, char **argv) {
            size_t ret;
            ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin);
            if (feof(fin) || ret != LPCNET_FRAME_SIZE) break;
-            lpcnet_compute_single_frame_features(net, pcm, features);
+            lpcnet_compute_single_frame_features(net, pcm, features, arch);
            fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout);
        }
        lpcnet_encoder_destroy(net);
--- a/dnn/lpcnet_enc.c
+++ b/dnn/lpcnet_enc.c
@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const

 #define celt_log10(x) (0.3010299957f*celt_log2(x))

-void compute_frame_features(LPCNetEncState *st, const float *in) {
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch) {
  float aligned_in[FRAME_SIZE];
  int i;
  float Ly[NB_BANDS];
@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
  OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER);
  OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE);
  OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER);
-  celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch);
+  celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, arch);
  for (i=0;i<FRAME_SIZE;i++) {
    st->exc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt;
    st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i];
@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
  {
    double ener1;
    float *buf = st->exc_buf;
-    celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch);
+    celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch);
    ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE);
    ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1);
    /*printf("%f\n", st->frame_weight[sub]);*/
@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) {
    }
    /*printf("\n");*/
  }
-  st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features);
+  st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch);
 }

 void process_single_frame(LPCNetEncState *st, FILE *ffeat) {
@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) {
  }
 }

-static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) {
+static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) {
  preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE);
-  compute_frame_features(st, x);
+  compute_frame_features(st, x, arch);
  process_single_frame(st, NULL);
  OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES);
  return 0;
 }

-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) {
  int i;
  float x[FRAME_SIZE];
  for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
-  lpcnet_compute_single_frame_features_impl(st, x, features);
+  lpcnet_compute_single_frame_features_impl(st, x, features, arch);
  return 0;
 }

-int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]) {
+int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch) {
  int i;
  float x[FRAME_SIZE];
  for (i=0;i<FRAME_SIZE;i++) x[i] = pcm[i];
-  lpcnet_compute_single_frame_features_impl(st, x, features);
+  lpcnet_compute_single_frame_features_impl(st, x, features, arch);
  return 0;
 }
--- a/dnn/lpcnet_plc.c
+++ b/dnn/lpcnet_plc.c
@ -33,6 +33,7 @@
 #include "plc_data.h"
 #include "os_support.h"
 #include "common.h"
+#include "cpu_support.h"

 #ifndef M_PI
 #define M_PI 3.141592653
@ -54,6 +55,7 @@ void lpcnet_plc_reset(LPCNetPLCState *st) {

 int lpcnet_plc_init(LPCNetPLCState *st) {
  int ret;
+  st->arch = opus_select_arch();
  fargan_init(&st->fargan);
  lpcnet_encoder_init(&st->enc);
  st->analysis_pos = PLC_BUF_SIZE;
@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) {
  float dense_out[PLC_DENSE1_OUT_SIZE];
  PLCNetState *net = &st->plc_net;
  celt_assert(st->loaded);
-  _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in);
-  compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out);
-  compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state);
-  _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state);
+  _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch);
+  compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch);
+  compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch);
+  _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch);
 }

 static int get_fec_or_pred(LPCNetPLCState *st, float *out) {
@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) {
      float plc_features[2*NB_BANDS+NB_FEATURES+1];
      for (i=0;i<FRAME_SIZE;i++) x[i] = 32768.f*st->pcm[st->analysis_pos+i];
      burg_cepstral_analysis(plc_features, x);
-      lpcnet_compute_single_frame_features_float(&st->enc, x, st->features);
+      lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch);
      if ((st->analysis_gap && count > 0) || count > 1) {
        queue_features(st, st->features);
        OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES);
--- a/dnn/lpcnet_private.h
+++ b/dnn/lpcnet_private.h
@ -24,7 +24,6 @@

 struct LPCNetEncState{
  PitchDNNState pitchdnn;
-  int arch;
  float analysis_mem[OVERLAP_SIZE];
  float mem_preemph;
  kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ];
@ -67,7 +66,7 @@ struct LPCNetPLCState {

 void preemphasis(float *y, float *mem, const float *x, float coef, int N);

-void compute_frame_features(LPCNetEncState *st, const float *in);
+void compute_frame_features(LPCNetEncState *st, const float *in, int arch);

 void lpcnet_reset_signal(LPCNetState *lpcnet);
 void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features);
@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N,
 void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload);
 void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N);
 void process_single_frame(LPCNetEncState *st, FILE *ffeat);
-int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]);

 void process_single_frame(LPCNetEncState *st, FILE *ffeat);

--- a/dnn/nnet.c
+++ b/dnn/nnet.c
@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x)
   return x < 0 ? 0 : x;
 }

-static void compute_linear(const LinearLayer *linear, float *out, const float *in)
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch)
 {
-   int i, M, N;
-   const float *bias;
-   celt_assert(in != out);
-   bias = linear->bias;
-   M = linear->nb_inputs;
-   N = linear->nb_outputs;
-   if (linear->float_weights != NULL) {
-     if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
-     else sgemv(out, linear->float_weights, N, M, N, in);
-   } else if (linear->weights != NULL) {
-     if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
-     else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
-     /* Only use SU biases on for integer matrices on SU archs. */
-#ifdef USE_SU_BIAS
-     bias = linear->subias;
-#endif
-   }
-   else OPUS_CLEAR(out, N);
-   if (bias != NULL) {
-      for (i=0;i<N;i++) out[i] += bias[i];
-   }
-   if (linear->diag) {
-      /* Diag is only used for GRU recurrent weights. */
-      celt_assert(3*M == N);
-      for (i=0;i<M;i++) {
-         out[i] += linear->diag[i]*in[i];
-         out[i+M] += linear->diag[i+M]*in[i];
-         out[i+2*M] += linear->diag[i+2*M]*in[i];
-      }
-   }
-}
-
-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation)
-{
-   compute_linear(layer, output, input);
+   compute_linear(layer, output, input, arch);
   compute_activation(output, output, layer->nb_outputs, activation);
 }

 #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS)


-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in)
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch)
 {
  int i;
  int N;
@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
  h = &zrh[2*N];
  celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL);
  celt_assert(in != state);
-  compute_linear(input_weights, zrh, in);
-  compute_linear(recurrent_weights, recur, state);
+  compute_linear(input_weights, zrh, in, arch);
+  compute_linear(recurrent_weights, recur, state, arch);
  for (i=0;i<2*N;i++)
     zrh[i] += recur[i];
  compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID);
@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re
     state[i] = h[i];
 }

-void compute_glu(const LinearLayer *layer, float *output, const float *input)
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch)
 {
   int i;
   float act2[MAX_INPUTS];
   celt_assert(layer->nb_inputs == layer->nb_outputs);
-   compute_linear(layer, act2, input);
+   compute_linear(layer, act2, input, arch);
   compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID);
   if (input == output) {
     /* Give a vectorization hint to the compiler for the in-place case. */
@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation
   }
 }

-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input)
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch)
 {
   LinearLayer matrix;
   celt_assert(input != output);
@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
   matrix.nb_inputs = layer->nb_inputs;
   matrix.nb_outputs = layer->nb_neurons;
   matrix.scale = NULL;
-   compute_linear(&matrix, output, input);
+   compute_linear(&matrix, output, input, arch);
   compute_activation(output, output, layer->nb_neurons, layer->activation);
 }

@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *
 #endif
 #define MAX_IDX_SIZE 8192

-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input)
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch)
 {
  LinearLayer in_matrix, rec_matrix;
  int i, M, N;
@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat
  rec_matrix.float_weights = NULL;
 #endif
  rec_matrix.weights_idx = NULL;
-  compute_generic_gru(&in_matrix, &rec_matrix, state, input);
+  compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch);
 }


 #define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS

-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation)
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch)
 {
   float tmp[MAX_CONV_INPUTS_ALL];
   celt_assert(input != output);
   celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL);
   OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
   OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
-   compute_linear(layer, output, tmp);
+   compute_linear(layer, output, tmp, arch);
   compute_activation(output, output, layer->nb_outputs, activation);
   OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
 }

-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation)
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch)
 {
   float tmp[MAX_CONV_INPUTS_ALL];
   int ksize = layer->nb_inputs/input_size;
@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl
   if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size);
   else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size);
   OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size);
-   compute_linear(layer, output, tmp);
+   compute_linear(layer, output, tmp, arch);
   compute_activation(output, output, layer->nb_outputs, activation);
   if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size);
   else {
--- a/dnn/nnet.h
+++ b/dnn/nnet.h
@ -126,18 +126,18 @@ typedef struct {
  int dim;
 } EmbeddingLayer;

-void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation);
-void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in);
-void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation);
-void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation);
-void compute_glu(const LinearLayer *layer, float *output, const float *input);
-void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation);
+void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch);
+void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch);
+void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch);
+void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch);
+void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch);
+void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch);

 void compute_activation(float *output, const float *input, int N, int activation);

-void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input);
+void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch);

-void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input);
+void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch);



@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays,
 void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation);


+
+void compute_linear_c(const LinearLayer *linear, float *out, const float *in); /* C version; the default binding for compute_linear() below */
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+#include "x86/dnn_x86.h" /* may define compute_linear() together with OVERRIDE_COMPUTE_LINEAR */
+#endif
+
+#ifndef OVERRIDE_COMPUTE_LINEAR /* no arch-specific binding was selected: call the C version, arch is evaluated and discarded */
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in))
+#endif
+
+#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) /* build-time diagnostic only, no effect on generated code */
+#if defined(_MSC_VER)
+#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
+#else
+#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
+#endif
+#endif
+
+
+
 #endif /* NNET_H_ */
--- a/dnn/nnet_arch.h
+++ b/dnn/nnet_arch.h
@ -0,0 +1,76 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef NNET_ARCH_H
+#define NNET_ARCH_H
+
+#include "nnet.h"
+#include "arch.h"
+#include "os_support.h"
+#include "vec.h"
+
+#define CAT_SUFFIX2(a,b) a ## b
+#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b) /* extra level so the arguments are macro-expanded before ## pastes them */
+
+#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH) /* RTCD_ARCH must be defined by the including .c file, e.g. compute_linear_ + c -> compute_linear_c */
+
+void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in)
+{
+   int i, M, N;
+   const float *bias;
+   celt_assert(in != out); /* in is read again after out has been written (diag loop), so in-place use is not allowed */
+   bias = linear->bias;
+   M = linear->nb_inputs;
+   N = linear->nb_outputs;
+   if (linear->float_weights != NULL) { /* float weights take precedence when both kinds are set */
+     if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in);
+     else sgemv(out, linear->float_weights, N, M, N, in);
+   } else if (linear->weights != NULL) { /* integer weights; linear->scale is handed to the kernel */
+     if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in);
+     else cgemv8x4(out, linear->weights, linear->scale, N, M, in);
+     /* Only use SU biases for integer matrices on SU archs. */
+#ifdef USE_SU_BIAS
+     bias = linear->subias;
+#endif
+   }
+   else OPUS_CLEAR(out, N); /* no weight matrix at all: out is cleared before bias/diag are added */
+   if (bias != NULL) {
+      for (i=0;i<N;i++) out[i] += bias[i];
+   }
+   if (linear->diag) {
+      /* Diag is only used for GRU recurrent weights. */
+      celt_assert(3*M == N);
+      for (i=0;i<M;i++) { /* adds diag[k*M+i]*in[i] to each of the three M-sized blocks of out */
+         out[i] += linear->diag[i]*in[i];
+         out[i+M] += linear->diag[i+M]*in[i];
+         out[i+2*M] += linear->diag[i+2*M]*in[i];
+      }
+   }
+}
+
+
+#endif
--- a/dnn/nnet_default.c
+++ b/dnn/nnet_default.c
@ -0,0 +1,35 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+
+#define RTCD_ARCH c /* suffix pasted by RTCD_SUF() in nnet_arch.h: this file provides compute_linear_c() */
+
+#include "nnet_arch.h"
--- a/dnn/pitchdnn.c
+++ b/dnn/pitchdnn.c
@ -12,7 +12,8 @@
 float compute_pitchdnn(
    PitchDNNState *st,
    const float *if_features,
-    const float *xcorr_features
+    const float *xcorr_features,
+    int arch
    )
 {
  float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE];
@ -28,16 +29,16 @@ float compute_pitchdnn(
  float count=0;
  PitchDNN *model = &st->model;
  /* IF */
-  compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH);
-  compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH);
+  compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch);
+  compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch);
  /* xcorr*/
  OPUS_COPY(&conv1_tmp1[1], xcorr_features, NB_XCORR_FEATURES);
  compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH);
  compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH);

-  compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH);
-  compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out);
-  compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR);
+  compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch);
+  compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch);
+  compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch);
  for (i=0;i<180;i++) {
    if (output[i] > maxval) {
      pos = i;
@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st)
  ret = 0;
 #endif
  celt_assert(ret == 0);
-  /* FIXME: perform arch detection. */
 }

 int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) {
--- a/dnn/pitchdnn.h
+++ b/dnn/pitchdnn.h
@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len);
 float compute_pitchdnn(
    PitchDNNState *st,
    const float *if_features,
-    const float *xcorr_features
+    const float *xcorr_features,
+    int arch
    );

 #endif
--- a/dnn/vec_avx.h
+++ b/dnn/vec_avx.h
@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a,
  return res;
 }

-#if defined(_MSC_VER)
-#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance")
-#else
-#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance"
-#endif

 #else

--- a/dnn/x86/dnn_x86.h
+++ b/dnn/x86/dnn_x86.h
@ -0,0 +1,78 @@
+/* Copyright (c) 2011-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifndef DNN_X86_H
+#define DNN_X86_H
+
+#include "cpu_support.h"
+#include "opus_types.h"
+
+#if defined(OPUS_X86_MAY_HAVE_SSE2)
+void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_SSE4_1)
+void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+#if defined(OPUS_X86_MAY_HAVE_AVX2)
+void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in);
+#endif
+
+
+#if defined(OPUS_X86_PRESUME_AVX2) /* bind compute_linear() straight to the AVX2 version; arch is evaluated and discarded */
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) /* direct SSE4.1 binding, only when no AVX2 version is built */
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in))
+
+#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) /* direct SSE2 binding, only when no SSE4.1/AVX2 version is built */
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in))
+
+#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) /* otherwise pick the version at run time from a table indexed by arch */
+
+extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+                    const LinearLayer *linear,
+                    float *out,
+                    const float *in
+                    );
+
+#define OVERRIDE_COMPUTE_LINEAR
+#define compute_linear(linear, out, in, arch) \
+    ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in))
+
+#endif /* if no branch matched, OVERRIDE_COMPUTE_LINEAR is left undefined by this header */
+
+
+
+#endif /* DNN_X86_H */
--- a/dnn/x86/nnet_avx2.c
+++ b/dnn/x86/nnet_avx2.c
@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __AVX2__
+#error nnet_avx2.c is being compiled without AVX2 enabled
+#endif /* __AVX2__ */
+
+#define RTCD_ARCH avx2 /* suffix pasted by RTCD_SUF() in nnet_arch.h: this file provides compute_linear_avx2() */
+
+#include "nnet_arch.h"
--- a/dnn/x86/nnet_sse2.c
+++ b/dnn/x86/nnet_sse2.c
@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE2__
+#error nnet_sse2.c is being compiled without SSE2 enabled
+#endif /* __SSE2__ */
+
+#define RTCD_ARCH sse2 /* suffix pasted by RTCD_SUF() in nnet_arch.h: this file provides compute_linear_sse2() */
+
+#include "nnet_arch.h"
--- a/dnn/x86/nnet_sse4_1.c
+++ b/dnn/x86/nnet_sse4_1.c
@ -0,0 +1,38 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#ifndef __SSE4_1__
+#error nnet_sse4_1.c is being compiled without SSE4.1 enabled
+#endif /* __SSE4_1__ */
+
+#define RTCD_ARCH sse4_1 /* suffix pasted by RTCD_SUF() in nnet_arch.h: this file provides compute_linear_sse4_1() */
+
+#include "nnet_arch.h"
--- a/dnn/x86/x86_dnn_map.c
+++ b/dnn/x86/x86_dnn_map.c
@ -0,0 +1,54 @@
+/* Copyright (c) 2018-2019 Mozilla
+                 2023 Amazon */
+/*
+   Redistribution and use in source and binary forms, with or without
+   modification, are permitted provided that the following conditions
+   are met:
+
+   - Redistributions of source code must retain the above copyright
+   notice, this list of conditions and the following disclaimer.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions and the following disclaimer in the
+   documentation and/or other materials provided with the distribution.
+
+   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+   A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+   CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
+   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "x86/x86cpu.h"
+#include "nnet.h"
+
+#if defined(OPUS_HAVE_RTCD)
+
+#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2))
+
+void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])(
+         const LinearLayer *linear,
+         float *out,
+         const float *in
+) = { /* looked up as [(arch) & OPUS_ARCHMASK] by the compute_linear() macro in dnn_x86.h */
+  compute_linear_c,                /* non-sse */
+  compute_linear_c, /* NOTE(review): presumably the sse-only slot; no compute_linear_sse exists, so the C version is used - confirm index order against the x86 arch values */
+  MAY_HAVE_SSE2(compute_linear), /* sse2 */
+  MAY_HAVE_SSE4_1(compute_linear), /* sse4.1  */
+  MAY_HAVE_AVX2(compute_linear)  /* avx2 */
+};
+
+#endif /* OPUS_X86_MAY_HAVE_SSE2 && !OPUS_X86_PRESUME_AVX2 */
+
+
+#endif /* OPUS_HAVE_RTCD */
--- a/lpcnet_headers.mk
+++ b/lpcnet_headers.mk
@ -12,7 +12,9 @@ dnn/vec.h \
 dnn/vec_avx.h \
 dnn/vec_neon.h \
 dnn/pitchdnn.h \
-dnn/pitchdnn_data.h
+dnn/pitchdnn_data.h \
+dnn/x86/dnn_x86.h \
+dnn/nnet_arch.h

 DRED_HEAD = \
 silk/dred_coding.h \
--- a/lpcnet_sources.mk
+++ b/lpcnet_sources.mk
@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \
 dnn/lpcnet_plc.c \
 dnn/lpcnet_tables.c \
 dnn/nnet.c \
+dnn/nnet_default.c \
 dnn/plc_data.c \
 dnn/parse_lpcnet_weights.c \
 dnn/pitchdnn.c \
@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \
 silk/dred_encoder.c \
 silk/dred_coding.c \
 silk/dred_decoder.c
+
+DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c
+DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c
+DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c
+DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c
--- a/silk/dred_encoder.c
+++ b/silk/dred_encoder.c
@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels)
    dred_encoder_reset(enc);
 }

-static void dred_process_frame(DREDEnc *enc)
+static void dred_process_frame(DREDEnc *enc, int arch)
 {
    float feature_buffer[2 * 36];
    float input_buffer[2*DRED_NUM_FEATURES] = {0};
@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc)
    OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM);

    /* calculate LPCNet features */
-    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer);
-    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36);
+    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch);
+    lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch);

    /* prepare input buffer (discard LPC coefficients) */
    OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES);
    OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES);

    /* run RDOVAE encoder */
-    dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer);
+    dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch);
    enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES);
 }

@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float
    }
 }

-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay)
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch)
 {
    int curr_offset16k;
    int frame_size16k = frame_size * 16000 / enc->Fs;
@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex
        if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE)
        {
            curr_offset16k += 320;
-            dred_process_frame(enc);
+            dred_process_frame(enc, arch);
            enc->input_buffer_fill -= 2*DRED_FRAME_SIZE;
            OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill);
            /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */
--- a/silk/dred_encoder.h
+++ b/silk/dred_encoder.h
@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc);

 void dred_deinit_encoder(DREDEnc *enc);

-void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay);
+void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch);

 int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes);

--- a/src/opus_decoder.c
+++ b/src/opus_decoder.c
@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED *
      OPUS_COPY(dst, src, 1);
   if (dst->process_stage == 2)
      return OPUS_OK;
-   DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents);
+   DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch);
   dst->process_stage = 2;
   return OPUS_OK;
 #else
--- a/src/opus_encoder.c
+++ b/src/opus_encoder.c
@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_
 #ifdef ENABLE_DRED
    if ( st->dred_duration > 0 && st->dred_encoder.loaded ) {
        /* DRED Encoder */
-        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer );
+        dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch );
    } else {
        st->dred_encoder.latents_buffer_fill = 0;
    }