diff --git a/Makefile.am b/Makefile.am index 452f6d22..046d3069 100644 --- a/Makefile.am +++ b/Makefile.am @@ -50,18 +50,30 @@ if CPU_X86 if HAVE_RTCD CELT_SOURCES += $(CELT_SOURCES_X86_RTCD) SILK_SOURCES += $(SILK_SOURCES_X86_RTCD) +if ENABLE_DEEP_PLC +LPCNET_SOURCES += $(DNN_SOURCES_X86_RTCD) +endif endif if HAVE_SSE CELT_SOURCES += $(CELT_SOURCES_SSE) endif if HAVE_SSE2 CELT_SOURCES += $(CELT_SOURCES_SSE2) +if ENABLE_DEEP_PLC +LPCNET_SOURCES += $(DNN_SOURCES_SSE2) +endif endif if HAVE_SSE4_1 CELT_SOURCES += $(CELT_SOURCES_SSE4_1) +if ENABLE_DEEP_PLC +LPCNET_SOURCES += $(DNN_SOURCES_SSE4_1) +endif endif if HAVE_AVX2 CELT_SOURCES += $(CELT_SOURCES_AVX2) +if ENABLE_DEEP_PLC +LPCNET_SOURCES += $(DNN_SOURCES_AVX2) +endif endif endif @@ -398,19 +410,22 @@ $(SSE_OBJ): CFLAGS += $(OPUS_X86_SSE_CFLAGS) endif if HAVE_SSE2 -SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) +SSE2_OBJ = $(CELT_SOURCES_SSE2:.c=.lo) \ + $(DNN_SOURCES_SSE2:.c=.lo) $(SSE2_OBJ): CFLAGS += $(OPUS_X86_SSE2_CFLAGS) endif if HAVE_SSE4_1 SSE4_1_OBJ = $(CELT_SOURCES_SSE4_1:.c=.lo) \ + $(DNN_SOURCES_SSE4_1:.c=.lo) \ $(SILK_SOURCES_SSE4_1:.c=.lo) \ $(SILK_SOURCES_FIXED_SSE4_1:.c=.lo) $(SSE4_1_OBJ): CFLAGS += $(OPUS_X86_SSE4_1_CFLAGS) endif if HAVE_AVX2 -AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) +AVX2_OBJ = $(CELT_SOURCES_AVX2:.c=.lo) \ + $(DNN_SOURCES_AVX2:.c=.lo) $(AVX2_OBJ): CFLAGS += $(OPUS_X86_AVX2_CFLAGS) endif diff --git a/celt/x86/x86cpu.h b/celt/x86/x86cpu.h index fe46d1d9..6ce10e60 100644 --- a/celt/x86/x86cpu.h +++ b/celt/x86/x86cpu.h @@ -47,7 +47,7 @@ # endif # if defined(OPUS_X86_MAY_HAVE_AVX2) -# define MAY_HAVE_AVX2(name) name ## _avx +# define MAY_HAVE_AVX2(name) name ## _avx2 # else # define MAY_HAVE_AVX2(name) name ## _c # endif diff --git a/dnn/dred_rdovae_dec.c b/dnn/dred_rdovae_dec.c index e2b19b14..7797ee77 100644 --- a/dnn/dred_rdovae_dec.c +++ b/dnn/dred_rdovae_dec.c @@ -42,33 +42,35 @@ static void conv1_cond_init(float *mem, int len, int dilation, int *init) *init = 1; } -void 
DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents) +void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch) { int i; RDOVAEDecState dec; memset(&dec, 0, sizeof(dec)); - dred_rdovae_dec_init_states(&dec, model, state); + dred_rdovae_dec_init_states(&dec, model, state, arch); for (i = 0; i < 2*nb_latents; i += 2) { dred_rdovae_decode_qframe( &dec, model, &features[2*i*DRED_NUM_FEATURES], - &latents[(i/2)*DRED_LATENT_DIM]); + &latents[(i/2)*DRED_LATENT_DIM], + arch); } } void dred_rdovae_dec_init_states( RDOVAEDecState *h, /* io: state buffer handle */ const RDOVAEDec *model, - const float *initial_state /* i: initial state */ + const float *initial_state, /* i: initial state */ + int arch ) { float hidden[DEC_HIDDEN_INIT_OUT_SIZE]; float state_init[DEC_GRU1_STATE_SIZE+DEC_GRU2_STATE_SIZE+DEC_GRU3_STATE_SIZE+DEC_GRU4_STATE_SIZE+DEC_GRU5_STATE_SIZE]; int counter=0; - compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH); - compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH); + compute_generic_dense(&model->dec_hidden_init, hidden, initial_state, ACTIVATION_TANH, arch); + compute_generic_dense(&model->dec_gru_init, state_init, hidden, ACTIVATION_TANH, arch); OPUS_COPY(h->gru1_state, state_init, DEC_GRU1_STATE_SIZE); counter += DEC_GRU1_STATE_SIZE; OPUS_COPY(h->gru2_state, &state_init[counter], DEC_GRU2_STATE_SIZE); @@ -86,7 +88,8 @@ void dred_rdovae_decode_qframe( RDOVAEDecState *dec_state, /* io: state buffer handle */ const RDOVAEDec *model, float *qframe, /* o: quadruple feature frame (four concatenated frames in reverse order) */ - const float *input /* i: latent vector */ + const float *input, /* i: latent vector */ + int arch ) { float buffer[DEC_DENSE1_OUT_SIZE + DEC_GRU1_OUT_SIZE + DEC_GRU2_OUT_SIZE + DEC_GRU3_OUT_SIZE + DEC_GRU4_OUT_SIZE + 
DEC_GRU5_OUT_SIZE @@ -94,43 +97,43 @@ void dred_rdovae_decode_qframe( int output_index = 0; /* run encoder stack and concatenate output in buffer*/ - compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH); + compute_generic_dense(&model->dec_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch); output_index += DEC_DENSE1_OUT_SIZE; - compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer); - compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state); + compute_generic_gru(&model->dec_gru1_input, &model->dec_gru1_recurrent, dec_state->gru1_state, buffer, arch); + compute_glu(&model->dec_glu1, &buffer[output_index], dec_state->gru1_state, arch); output_index += DEC_GRU1_OUT_SIZE; conv1_cond_init(dec_state->conv1_state, output_index, 1, &dec_state->initialized); - compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->dec_conv1, &buffer[output_index], dec_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += DEC_CONV1_OUT_SIZE; - compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer); - compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state); + compute_generic_gru(&model->dec_gru2_input, &model->dec_gru2_recurrent, dec_state->gru2_state, buffer, arch); + compute_glu(&model->dec_glu2, &buffer[output_index], dec_state->gru2_state, arch); output_index += DEC_GRU2_OUT_SIZE; conv1_cond_init(dec_state->conv2_state, output_index, 1, &dec_state->initialized); - compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->dec_conv2, &buffer[output_index], dec_state->conv2_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += DEC_CONV2_OUT_SIZE; - 
compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer); - compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state); + compute_generic_gru(&model->dec_gru3_input, &model->dec_gru3_recurrent, dec_state->gru3_state, buffer, arch); + compute_glu(&model->dec_glu3, &buffer[output_index], dec_state->gru3_state, arch); output_index += DEC_GRU3_OUT_SIZE; conv1_cond_init(dec_state->conv3_state, output_index, 1, &dec_state->initialized); - compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->dec_conv3, &buffer[output_index], dec_state->conv3_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += DEC_CONV3_OUT_SIZE; - compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer); - compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state); + compute_generic_gru(&model->dec_gru4_input, &model->dec_gru4_recurrent, dec_state->gru4_state, buffer, arch); + compute_glu(&model->dec_glu4, &buffer[output_index], dec_state->gru4_state, arch); output_index += DEC_GRU4_OUT_SIZE; conv1_cond_init(dec_state->conv4_state, output_index, 1, &dec_state->initialized); - compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->dec_conv4, &buffer[output_index], dec_state->conv4_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += DEC_CONV4_OUT_SIZE; - compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer); - compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state); + compute_generic_gru(&model->dec_gru5_input, &model->dec_gru5_recurrent, dec_state->gru5_state, buffer, arch); + compute_glu(&model->dec_glu5, &buffer[output_index], dec_state->gru5_state, arch); 
output_index += DEC_GRU5_OUT_SIZE; conv1_cond_init(dec_state->conv5_state, output_index, 1, &dec_state->initialized); - compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->dec_conv5, &buffer[output_index], dec_state->conv5_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += DEC_CONV5_OUT_SIZE; - compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR); + compute_generic_dense(&model->dec_output, qframe, buffer, ACTIVATION_LINEAR, arch); } diff --git a/dnn/dred_rdovae_dec.h b/dnn/dred_rdovae_dec.h index 636f0ee0..4e66911c 100644 --- a/dnn/dred_rdovae_dec.h +++ b/dnn/dred_rdovae_dec.h @@ -46,8 +46,8 @@ struct RDOVAEDecStruct { float conv5_state[DEC_CONV5_STATE_SIZE]; }; -void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state); -void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z); -void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents); +void dred_rdovae_dec_init_states(RDOVAEDecState *h, const RDOVAEDec *model, const float * initial_state, int arch); +void dred_rdovae_decode_qframe(RDOVAEDecState *h, const RDOVAEDec *model, float *qframe, const float * z, int arch); +void DRED_rdovae_decode_all(const RDOVAEDec *model, float *features, const float *state, const float *latents, int nb_latents, int arch); #endif diff --git a/dnn/dred_rdovae_enc.c b/dnn/dred_rdovae_enc.c index e159e632..4f13ae21 100644 --- a/dnn/dred_rdovae_enc.c +++ b/dnn/dred_rdovae_enc.c @@ -50,7 +50,8 @@ void dred_rdovae_encode_dframe( const RDOVAEEnc *model, float *latents, /* o: latent vector */ float *initial_state, /* o: initial state */ - const float *input /* i: double feature frame (concatenated) */ + const float *input, /* i: double feature frame (concatenated) */ + int arch ) 
{ float padded_latents[DRED_PADDED_LATENT_DIM]; @@ -61,49 +62,49 @@ void dred_rdovae_encode_dframe( int output_index = 0; /* run encoder stack and concatenate output in buffer*/ - compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH); + compute_generic_dense(&model->enc_dense1, &buffer[output_index], input, ACTIVATION_TANH, arch); output_index += ENC_DENSE1_OUT_SIZE; - compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer); + compute_generic_gru(&model->enc_gru1_input, &model->enc_gru1_recurrent, enc_state->gru1_state, buffer, arch); OPUS_COPY(&buffer[output_index], enc_state->gru1_state, ENC_GRU1_OUT_SIZE); output_index += ENC_GRU1_OUT_SIZE; conv1_cond_init(enc_state->conv1_state, output_index, 1, &enc_state->initialized); - compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH); + compute_generic_conv1d(&model->enc_conv1, &buffer[output_index], enc_state->conv1_state, buffer, output_index, ACTIVATION_TANH, arch); output_index += ENC_CONV1_OUT_SIZE; - compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer); + compute_generic_gru(&model->enc_gru2_input, &model->enc_gru2_recurrent, enc_state->gru2_state, buffer, arch); OPUS_COPY(&buffer[output_index], enc_state->gru2_state, ENC_GRU2_OUT_SIZE); output_index += ENC_GRU2_OUT_SIZE; conv1_cond_init(enc_state->conv2_state, output_index, 2, &enc_state->initialized); - compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH); + compute_generic_conv1d_dilation(&model->enc_conv2, &buffer[output_index], enc_state->conv2_state, buffer, output_index, 2, ACTIVATION_TANH, arch); output_index += ENC_CONV2_OUT_SIZE; - compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer); + 
compute_generic_gru(&model->enc_gru3_input, &model->enc_gru3_recurrent, enc_state->gru3_state, buffer, arch); OPUS_COPY(&buffer[output_index], enc_state->gru3_state, ENC_GRU3_OUT_SIZE); output_index += ENC_GRU3_OUT_SIZE; conv1_cond_init(enc_state->conv3_state, output_index, 2, &enc_state->initialized); - compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH); + compute_generic_conv1d_dilation(&model->enc_conv3, &buffer[output_index], enc_state->conv3_state, buffer, output_index, 2, ACTIVATION_TANH, arch); output_index += ENC_CONV3_OUT_SIZE; - compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer); + compute_generic_gru(&model->enc_gru4_input, &model->enc_gru4_recurrent, enc_state->gru4_state, buffer, arch); OPUS_COPY(&buffer[output_index], enc_state->gru4_state, ENC_GRU4_OUT_SIZE); output_index += ENC_GRU4_OUT_SIZE; conv1_cond_init(enc_state->conv4_state, output_index, 2, &enc_state->initialized); - compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH); + compute_generic_conv1d_dilation(&model->enc_conv4, &buffer[output_index], enc_state->conv4_state, buffer, output_index, 2, ACTIVATION_TANH, arch); output_index += ENC_CONV4_OUT_SIZE; - compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer); + compute_generic_gru(&model->enc_gru5_input, &model->enc_gru5_recurrent, enc_state->gru5_state, buffer, arch); OPUS_COPY(&buffer[output_index], enc_state->gru5_state, ENC_GRU5_OUT_SIZE); output_index += ENC_GRU5_OUT_SIZE; conv1_cond_init(enc_state->conv5_state, output_index, 2, &enc_state->initialized); - compute_generic_conv1d_dilation(&model->enc_conv5, &buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH); + compute_generic_conv1d_dilation(&model->enc_conv5, 
&buffer[output_index], enc_state->conv5_state, buffer, output_index, 2, ACTIVATION_TANH, arch); output_index += ENC_CONV5_OUT_SIZE; - compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR); + compute_generic_dense(&model->enc_zdense, padded_latents, buffer, ACTIVATION_LINEAR, arch); OPUS_COPY(latents, padded_latents, DRED_LATENT_DIM); /* next, calculate initial state */ - compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH); - compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR); + compute_generic_dense(&model->gdense1, state_hidden, buffer, ACTIVATION_TANH, arch); + compute_generic_dense(&model->gdense2, padded_state, state_hidden, ACTIVATION_LINEAR, arch); OPUS_COPY(initial_state, padded_state, DRED_STATE_DIM); } diff --git a/dnn/dred_rdovae_enc.h b/dnn/dred_rdovae_enc.h index 43a4e8b2..6fe537ee 100644 --- a/dnn/dred_rdovae_enc.h +++ b/dnn/dred_rdovae_enc.h @@ -46,7 +46,7 @@ struct RDOVAEEncStruct { float conv5_state[2*ENC_CONV5_STATE_SIZE]; }; -void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input); +void dred_rdovae_encode_dframe(RDOVAEEncState *enc_state, const RDOVAEEnc *model, float *latents, float *initial_state, const float *input, int arch); #endif diff --git a/dnn/dump_data.c b/dnn/dump_data.c index be1ff16e..e7acfb11 100644 --- a/dnn/dump_data.c +++ b/dnn/dump_data.c @@ -42,6 +42,7 @@ #include "lpcnet.h" #include "lpcnet_private.h" #include "os_support.h" +#include "cpu_support.h" static void biquad(float *y, float mem[2], const float *x, const float *b, const float *a, int N) { @@ -135,7 +136,9 @@ int main(int argc, char **argv) { FILE *fnoise = NULL; float noise_gain = 0; long noise_size=0; + int arch; srand(getpid()); + arch = opus_select_arch(); st = lpcnet_encoder_create(); argv0=argv[0]; if (argc == 5 && strcmp(argv[1], "-btrain")==0) { @@ -244,7 +247,7 @@ int main(int argc, 
char **argv) { for (i=0;icond_net_pembed.float_weights[IMAX(0,IMIN(period-32, 224))*COND_NET_PEMBED_OUT_SIZE], COND_NET_PEMBED_OUT_SIZE); OPUS_COPY(dense_in, features, NB_FEATURES); - compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH); - compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH); - compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH); + compute_generic_dense(&model->cond_net_fdense1, conv1_in, dense_in, ACTIVATION_TANH, st->arch); + compute_generic_conv1d(&model->cond_net_fconv1, conv2_in, st->cond_conv1_state, conv1_in, COND_NET_FCONV1_IN_SIZE, ACTIVATION_TANH, st->arch); + compute_generic_conv1d(&model->cond_net_fconv2, cond, st->cond_conv2_state, conv2_in, COND_NET_FCONV2_IN_SIZE, ACTIVATION_TANH, st->arch); } static void fargan_deemphasis(float *pcm, float *deemph_mem) { @@ -84,7 +85,7 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond, celt_assert(st->cont_initialized); model = &st->model; - compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR); + compute_generic_dense(&model->sig_net_cond_gain_dense, &gain, cond, ACTIVATION_LINEAR, st->arch); gain = exp(gain); gain_1 = 1.f/(1e-5f + gain); @@ -100,26 +101,26 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond, OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE], pred, FARGAN_SUBFRAME_SIZE+4); OPUS_COPY(&fwc0_in[FARGAN_COND_SIZE+FARGAN_SUBFRAME_SIZE+4], prev, FARGAN_SUBFRAME_SIZE); - compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH); + compute_generic_conv1d(&model->sig_net_fwc0_conv, gru1_in, st->fwc0_mem, fwc0_in, SIG_NET_INPUT_SIZE, ACTIVATION_TANH, st->arch); celt_assert(SIG_NET_FWC0_GLU_GATE_OUT_SIZE == model->sig_net_fwc0_glu_gate.nb_outputs); - 
compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in); + compute_glu(&model->sig_net_fwc0_glu_gate, gru1_in, gru1_in, st->arch); - compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID); + compute_generic_dense(&model->sig_net_gain_dense_out, pitch_gate, gru1_in, ACTIVATION_SIGMOID, st->arch); for (i=0;isig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in); - compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state); + compute_generic_gru(&model->sig_net_gru1_input, &model->sig_net_gru1_recurrent, st->gru1_state, gru1_in, st->arch); + compute_glu(&model->sig_net_gru1_glu_gate, gru2_in, st->gru1_state, st->arch); for (i=0;isig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in); - compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state); + compute_generic_gru(&model->sig_net_gru2_input, &model->sig_net_gru2_recurrent, st->gru2_state, gru2_in, st->arch); + compute_glu(&model->sig_net_gru2_glu_gate, gru3_in, st->gru2_state, st->arch); for (i=0;isig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in); - compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state); + compute_generic_gru(&model->sig_net_gru3_input, &model->sig_net_gru3_recurrent, st->gru3_state, gru3_in, st->arch); + compute_glu(&model->sig_net_gru3_glu_gate, &skip_cat[SIG_NET_GRU1_OUT_SIZE+SIG_NET_GRU2_OUT_SIZE], st->gru3_state, st->arch); OPUS_COPY(skip_cat, gru2_in, SIG_NET_GRU1_OUT_SIZE); OPUS_COPY(&skip_cat[SIG_NET_GRU1_OUT_SIZE], gru3_in, SIG_NET_GRU2_OUT_SIZE); @@ -127,10 +128,10 @@ static void run_fargan_subframe(FARGANState *st, float *pcm, const float *cond, for (i=0;isig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH); - compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out); + compute_generic_dense(&model->sig_net_skip_dense, skip_out, skip_cat, ACTIVATION_TANH, st->arch); + 
compute_glu(&model->sig_net_skip_glu_gate, skip_out, skip_out, st->arch); - compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH); + compute_generic_dense(&model->sig_net_sig_dense_out, pcm, skip_out, ACTIVATION_TANH, st->arch); for (i=0;ipitch_buf, &st->pitch_buf[FARGAN_SUBFRAME_SIZE], PITCH_MAX_PERIOD-FARGAN_SUBFRAME_SIZE); @@ -174,13 +175,13 @@ void fargan_init(FARGANState *st) { int ret; OPUS_CLEAR(st, 1); + st->arch = opus_select_arch(); #ifndef USE_WEIGHTS_FILE ret = init_fargan(&st->model, fargan_arrays); #else ret = 0; #endif celt_assert(ret == 0); - /* FIXME: perform arch detection. */ } int fargan_load_model(FARGANState *st, const unsigned char *data, int len) { diff --git a/dnn/lpcnet.h b/dnn/lpcnet.h index adcba515..ec39dc24 100644 --- a/dnn/lpcnet.h +++ b/dnn/lpcnet.h @@ -120,7 +120,7 @@ int lpcnet_encode(LPCNetEncState *st, const opus_int16 *pcm, unsigned char *buf) * @param [out] features float[NB_TOTAL_FEATURES]: Four feature vectors * @retval 0 Success */ -int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]); +int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch); /** Compute features on LPCNET_FRAME_SIZE speech samples (currently 160) and output features for one 10-ms frame. @@ -129,7 +129,7 @@ int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *p * @param [out] features float[NB_TOTAL_FEATURES]: Four feature vectors * @retval 0 Success */ -int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES]); +int lpcnet_compute_single_frame_features_float(LPCNetEncState *st, const float *pcm, float features[NB_TOTAL_FEATURES], int arch); /** Gets the size of an LPCNetState structure. * @returns The size in bytes. 
diff --git a/dnn/lpcnet_demo.c b/dnn/lpcnet_demo.c index cfa9f6fd..aad31190 100644 --- a/dnn/lpcnet_demo.c +++ b/dnn/lpcnet_demo.c @@ -37,6 +37,7 @@ #include "freq.h" #include "os_support.h" #include "fargan.h" +#include "cpu_support.h" #ifdef USE_WEIGHTS_FILE # if __unix__ @@ -99,12 +100,14 @@ void usage(void) { int main(int argc, char **argv) { int mode=0; + int arch; FILE *fin, *fout; #ifdef USE_WEIGHTS_FILE int len; unsigned char *data; const char *filename = "weights_blob.bin"; #endif + arch = opus_select_arch(); if (argc < 4) usage(); if (strcmp(argv[1], "-features") == 0) mode=MODE_FEATURES; else if (strcmp(argv[1], "-fargan-synthesis") == 0) mode=MODE_FARGAN_SYNTHESIS; @@ -137,7 +140,7 @@ int main(int argc, char **argv) { size_t ret; ret = fread(pcm, sizeof(pcm[0]), LPCNET_FRAME_SIZE, fin); if (feof(fin) || ret != LPCNET_FRAME_SIZE) break; - lpcnet_compute_single_frame_features(net, pcm, features); + lpcnet_compute_single_frame_features(net, pcm, features, arch); fwrite(features, sizeof(float), NB_TOTAL_FEATURES, fout); } lpcnet_encoder_destroy(net); diff --git a/dnn/lpcnet_enc.c b/dnn/lpcnet_enc.c index c2c5578b..8e3164df 100644 --- a/dnn/lpcnet_enc.c +++ b/dnn/lpcnet_enc.c @@ -95,7 +95,7 @@ static void biquad(float *y, float mem[2], const float *x, const float *b, const #define celt_log10(x) (0.3010299957f*celt_log2(x)) -void compute_frame_features(LPCNetEncState *st, const float *in) { +void compute_frame_features(LPCNetEncState *st, const float *in, int arch) { float aligned_in[FRAME_SIZE]; int i; float Ly[NB_BANDS]; @@ -142,7 +142,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) { OPUS_COPY(&x[0], st->pitch_mem, LPC_ORDER); OPUS_COPY(&x[LPC_ORDER], aligned_in, FRAME_SIZE); OPUS_COPY(st->pitch_mem, &aligned_in[FRAME_SIZE-LPC_ORDER], LPC_ORDER); - celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, st->arch); + celt_fir(&x[LPC_ORDER], st->lpc, &st->lp_buf[PITCH_MAX_PERIOD], FRAME_SIZE, LPC_ORDER, 
arch); for (i=0;iexc_buf[PITCH_MAX_PERIOD+i] = st->lp_buf[PITCH_MAX_PERIOD+i] + .7f*st->pitch_filt; st->pitch_filt = st->lp_buf[PITCH_MAX_PERIOD+i]; @@ -152,7 +152,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) { { double ener1; float *buf = st->exc_buf; - celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, st->arch); + celt_pitch_xcorr(&buf[PITCH_MAX_PERIOD], buf, xcorr, FRAME_SIZE, PITCH_MAX_PERIOD-PITCH_MIN_PERIOD, arch); ener0 = celt_inner_prod_c(&buf[PITCH_MAX_PERIOD], &buf[PITCH_MAX_PERIOD], FRAME_SIZE); ener1 = celt_inner_prod_c(&buf[0], &buf[0], FRAME_SIZE-1); /*printf("%f\n", st->frame_weight[sub]);*/ @@ -165,7 +165,7 @@ void compute_frame_features(LPCNetEncState *st, const float *in) { } /*printf("\n");*/ } - st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features); + st->dnn_pitch = compute_pitchdnn(&st->pitchdnn, st->if_features, st->xcorr_features, arch); } void process_single_frame(LPCNetEncState *st, FILE *ffeat) { @@ -196,26 +196,26 @@ void preemphasis(float *y, float *mem, const float *x, float coef, int N) { } } -static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES]) { +static int lpcnet_compute_single_frame_features_impl(LPCNetEncState *st, float *x, float features[NB_TOTAL_FEATURES], int arch) { preemphasis(x, &st->mem_preemph, x, PREEMPHASIS, FRAME_SIZE); - compute_frame_features(st, x); + compute_frame_features(st, x, arch); process_single_frame(st, NULL); OPUS_COPY(features, &st->features[0], NB_TOTAL_FEATURES); return 0; } -int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]) { +int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES], int arch) { int i; float x[FRAME_SIZE]; for (i=0;iarch = opus_select_arch(); fargan_init(&st->fargan); 
lpcnet_encoder_init(&st->enc); st->analysis_pos = PLC_BUF_SIZE; @@ -109,10 +111,10 @@ static void compute_plc_pred(LPCNetPLCState *st, float *out, const float *in) { float dense_out[PLC_DENSE1_OUT_SIZE]; PLCNetState *net = &st->plc_net; celt_assert(st->loaded); - _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in); - compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out); - compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state); - _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state); + _lpcnet_compute_dense(&st->model.plc_dense1, dense_out, in, st->arch); + compute_gruB(&st->model.plc_gru1, zeros, net->plc_gru1_state, dense_out, st->arch); + compute_gruB(&st->model.plc_gru2, zeros, net->plc_gru2_state, net->plc_gru1_state, st->arch); + _lpcnet_compute_dense(&st->model.plc_out, out, net->plc_gru2_state, st->arch); } static int get_fec_or_pred(LPCNetPLCState *st, float *out) { @@ -164,7 +166,7 @@ int lpcnet_plc_conceal(LPCNetPLCState *st, opus_int16 *pcm) { float plc_features[2*NB_BANDS+NB_FEATURES+1]; for (i=0;ipcm[st->analysis_pos+i]; burg_cepstral_analysis(plc_features, x); - lpcnet_compute_single_frame_features_float(&st->enc, x, st->features); + lpcnet_compute_single_frame_features_float(&st->enc, x, st->features, st->arch); if ((st->analysis_gap && count > 0) || count > 1) { queue_features(st, st->features); OPUS_COPY(&plc_features[2*NB_BANDS], st->features, NB_FEATURES); diff --git a/dnn/lpcnet_private.h b/dnn/lpcnet_private.h index 30931b1d..9a68c718 100644 --- a/dnn/lpcnet_private.h +++ b/dnn/lpcnet_private.h @@ -24,7 +24,6 @@ struct LPCNetEncState{ PitchDNNState pitchdnn; - int arch; float analysis_mem[OVERLAP_SIZE]; float mem_preemph; kiss_fft_cpx prev_if[PITCH_IF_MAX_FREQ]; @@ -67,7 +66,7 @@ struct LPCNetPLCState { void preemphasis(float *y, float *mem, const float *x, float coef, int N); -void compute_frame_features(LPCNetEncState *st, const float *in); +void 
compute_frame_features(LPCNetEncState *st, const float *in, int arch); void lpcnet_reset_signal(LPCNetState *lpcnet); void run_frame_network(LPCNetState *lpcnet, float *gru_a_condition, float *gru_b_condition, float *lpc, const float *features); @@ -79,7 +78,6 @@ void lpcnet_synthesize_tail_impl(LPCNetState *lpcnet, opus_int16 *output, int N, void lpcnet_synthesize_impl(LPCNetState *lpcnet, const float *features, opus_int16 *output, int N, int preload); void lpcnet_synthesize_blend_impl(LPCNetState *lpcnet, const opus_int16 *pcm_in, opus_int16 *output, int N); void process_single_frame(LPCNetEncState *st, FILE *ffeat); -int lpcnet_compute_single_frame_features(LPCNetEncState *st, const opus_int16 *pcm, float features[NB_TOTAL_FEATURES]); void process_single_frame(LPCNetEncState *st, FILE *ffeat); diff --git a/dnn/nnet.c b/dnn/nnet.c index c76e9f28..22fda89b 100644 --- a/dnn/nnet.c +++ b/dnn/nnet.c @@ -69,50 +69,16 @@ static OPUS_INLINE float relu(float x) return x < 0 ? 0 : x; } -static void compute_linear(const LinearLayer *linear, float *out, const float *in) +void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation, int arch) { - int i, M, N; - const float *bias; - celt_assert(in != out); - bias = linear->bias; - M = linear->nb_inputs; - N = linear->nb_outputs; - if (linear->float_weights != NULL) { - if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in); - else sgemv(out, linear->float_weights, N, M, N, in); - } else if (linear->weights != NULL) { - if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in); - else cgemv8x4(out, linear->weights, linear->scale, N, M, in); - /* Only use SU biases on for integer matrices on SU archs. */ -#ifdef USE_SU_BIAS - bias = linear->subias; -#endif - } - else OPUS_CLEAR(out, N); - if (bias != NULL) { - for (i=0;idiag) { - /* Diag is only used for GRU recurrent weights. 
*/ - celt_assert(3*M == N); - for (i=0;idiag[i]*in[i]; - out[i+M] += linear->diag[i+M]*in[i]; - out[i+2*M] += linear->diag[i+2*M]*in[i]; - } - } -} - -void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation) -{ - compute_linear(layer, output, input); + compute_linear(layer, output, input, arch); compute_activation(output, output, layer->nb_outputs, activation); } #define MAX_RNN_NEURONS_ALL IMAX(IMAX(FARGAN_MAX_RNN_NEURONS, PLC_MAX_RNN_NEURONS), DRED_MAX_RNN_NEURONS) -void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in) +void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch) { int i; int N; @@ -129,8 +95,8 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re h = &zrh[2*N]; celt_assert(recurrent_weights->nb_outputs <= 3*MAX_RNN_NEURONS_ALL); celt_assert(in != state); - compute_linear(input_weights, zrh, in); - compute_linear(recurrent_weights, recur, state); + compute_linear(input_weights, zrh, in, arch); + compute_linear(recurrent_weights, recur, state, arch); for (i=0;i<2*N;i++) zrh[i] += recur[i]; compute_activation(zrh, zrh, 2*N, ACTIVATION_SIGMOID); @@ -143,12 +109,12 @@ void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *re state[i] = h[i]; } -void compute_glu(const LinearLayer *layer, float *output, const float *input) +void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch) { int i; float act2[MAX_INPUTS]; celt_assert(layer->nb_inputs == layer->nb_outputs); - compute_linear(layer, act2, input); + compute_linear(layer, act2, input, arch); compute_activation(act2, act2, layer->nb_outputs, ACTIVATION_SIGMOID); if (input == output) { /* Give a vectorization hint to the compiler for the in-place case. 
*/ @@ -194,7 +160,7 @@ void compute_activation(float *output, const float *input, int N, int activation } } -void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input) +void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch) { LinearLayer matrix; celt_assert(input != output); @@ -207,7 +173,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float * matrix.nb_inputs = layer->nb_inputs; matrix.nb_outputs = layer->nb_neurons; matrix.scale = NULL; - compute_linear(&matrix, output, input); + compute_linear(&matrix, output, input, arch); compute_activation(output, output, layer->nb_neurons, layer->activation); } @@ -218,7 +184,7 @@ void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float * #endif #define MAX_IDX_SIZE 8192 -void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input) +void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch) { LinearLayer in_matrix, rec_matrix; int i, M, N; @@ -262,25 +228,25 @@ void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *stat rec_matrix.float_weights = NULL; #endif rec_matrix.weights_idx = NULL; - compute_generic_gru(&in_matrix, &rec_matrix, state, input); + compute_generic_gru(&in_matrix, &rec_matrix, state, input, arch); } #define MAX_CONV_INPUTS_ALL DRED_MAX_CONV_INPUTS -void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation) +void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch) { float tmp[MAX_CONV_INPUTS_ALL]; celt_assert(input != output); celt_assert(layer->nb_inputs <= MAX_CONV_INPUTS_ALL); OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); - 
compute_linear(layer, output, tmp); + compute_linear(layer, output, tmp, arch); compute_activation(output, output, layer->nb_outputs, activation); OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); } -void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation) +void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch) { float tmp[MAX_CONV_INPUTS_ALL]; int ksize = layer->nb_inputs/input_size; @@ -290,7 +256,7 @@ void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, fl if (dilation==1) OPUS_COPY(tmp, mem, layer->nb_inputs-input_size); else for (i=0;i<ksize-1;i++) OPUS_COPY(&tmp[i*input_size], &mem[i*input_size*dilation], input_size); OPUS_COPY(&tmp[layer->nb_inputs-input_size], input, input_size); - compute_linear(layer, output, tmp); + compute_linear(layer, output, tmp, arch); compute_activation(output, output, layer->nb_outputs, activation); if (dilation==1) OPUS_COPY(mem, &tmp[input_size], layer->nb_inputs-input_size); else { diff --git a/dnn/nnet.h b/dnn/nnet.h index 64b59d66..c8240ffc 100644 --- a/dnn/nnet.h +++ b/dnn/nnet.h @@ -126,18 +126,18 @@ typedef struct { int dim; } EmbeddingLayer; -void compute_generic_dense(const LinearLayer *layer, float *output, const float *input, int activation); -void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in); -void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation); -void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation); -void compute_glu(const LinearLayer *layer, float *output, const float *input); -void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation); +void compute_generic_dense(const
LinearLayer *layer, float *output, const float *input, int activation, int arch); +void compute_generic_gru(const LinearLayer *input_weights, const LinearLayer *recurrent_weights, float *state, const float *in, int arch); +void compute_generic_conv1d(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int activation, int arch); +void compute_generic_conv1d_dilation(const LinearLayer *layer, float *output, float *mem, const float *input, int input_size, int dilation, int activation, int arch); +void compute_glu(const LinearLayer *layer, float *output, const float *input, int arch); +void compute_gated_activation(const LinearLayer *layer, float *output, const float *input, int activation, int arch); void compute_activation(float *output, const float *input, int N, int activation); -void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input); +void _lpcnet_compute_dense(const DenseLayer *layer, float *output, const float *input, int arch); -void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input); +void compute_gruB(const GRULayer *gru, const float* gru_b_condition, float *state, const float *input, int arch); @@ -189,4 +189,25 @@ int gru_init(GRULayer *layer, const WeightArray *arrays, void compute_conv2d(const Conv2dLayer *conv, float *out, float *mem, const float *in, int height, int hstride, int activation); + +void compute_linear_c(const LinearLayer *linear, float *out, const float *in); + +#if defined(OPUS_X86_MAY_HAVE_SSE2) +#include "x86/dnn_x86.h" +#endif + +#ifndef OVERRIDE_COMPUTE_LINEAR +#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_c(linear, out, in)) +#endif + +#if defined(__x86_64__) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) +#if defined(_MSC_VER) +#pragma message ("Only SSE and SSE2 are available. 
On newer machines, enable SSSE3/AVX/AVX2 to get better performance") +#else +#warning "Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance" +#endif +#endif + + + #endif /* NNET_H_ */ diff --git a/dnn/nnet_arch.h b/dnn/nnet_arch.h new file mode 100644 index 00000000..00198579 --- /dev/null +++ b/dnn/nnet_arch.h @@ -0,0 +1,76 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef NNET_ARCH_H +#define NNET_ARCH_H + +#include "nnet.h" +#include "arch.h" +#include "os_support.h" +#include "vec.h" + +#define CAT_SUFFIX2(a,b) a ## b +#define CAT_SUFFIX(a,b) CAT_SUFFIX2(a, b) + +#define RTCD_SUF(name) CAT_SUFFIX(name, RTCD_ARCH) + +void RTCD_SUF(compute_linear_) (const LinearLayer *linear, float *out, const float *in) +{ + int i, M, N; + const float *bias; + celt_assert(in != out); + bias = linear->bias; + M = linear->nb_inputs; + N = linear->nb_outputs; + if (linear->float_weights != NULL) { + if (linear->weights_idx != NULL) sparse_sgemv8x4(out, linear->float_weights, linear->weights_idx, N, in); + else sgemv(out, linear->float_weights, N, M, N, in); + } else if (linear->weights != NULL) { + if (linear->weights_idx != NULL) sparse_cgemv8x4(out, linear->weights, linear->weights_idx, linear->scale, N, M, in); + else cgemv8x4(out, linear->weights, linear->scale, N, M, in); + /* Only use SU biases on for integer matrices on SU archs. */ +#ifdef USE_SU_BIAS + bias = linear->subias; +#endif + } + else OPUS_CLEAR(out, N); + if (bias != NULL) { + for (i=0;i<N;i++) out[i] += bias[i]; + } + if (linear->diag) { + /* Diag is only used for GRU recurrent weights. */ + celt_assert(3*M == N); + for (i=0;i<M;i++) { + out[i] += linear->diag[i]*in[i]; + out[i+M] += linear->diag[i+M]*in[i]; + out[i+2*M] += linear->diag[i+2*M]*in[i]; + } + } +} + + +#endif diff --git a/dnn/nnet_default.c b/dnn/nnet_default.c new file mode 100644 index 00000000..4316f0fb --- /dev/null +++ b/dnn/nnet_default.c @@ -0,0 +1,35 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer.
+ + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + + +#define RTCD_ARCH c + +#include "nnet_arch.h" diff --git a/dnn/pitchdnn.c b/dnn/pitchdnn.c index 5bb3a57c..1ca15dc6 100644 --- a/dnn/pitchdnn.c +++ b/dnn/pitchdnn.c @@ -12,7 +12,8 @@ float compute_pitchdnn( PitchDNNState *st, const float *if_features, - const float *xcorr_features + const float *xcorr_features, + int arch ) { float if1_out[DENSE_IF_UPSAMPLER_1_OUT_SIZE]; @@ -28,16 +29,16 @@ float compute_pitchdnn( float count=0; PitchDNN *model = &st->model; /* IF */ - compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH); - compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH); + compute_generic_dense(&model->dense_if_upsampler_1, if1_out, if_features, ACTIVATION_TANH, arch); + compute_generic_dense(&model->dense_if_upsampler_2, &downsampler_in[NB_XCORR_FEATURES], if1_out, ACTIVATION_TANH, arch); /* xcorr*/ OPUS_COPY(&conv1_tmp1[1], 
xcorr_features, NB_XCORR_FEATURES); compute_conv2d(&model->conv2d_1, &conv1_tmp2[1], st->xcorr_mem1, conv1_tmp1, NB_XCORR_FEATURES, NB_XCORR_FEATURES+2, ACTIVATION_TANH); compute_conv2d(&model->conv2d_2, downsampler_in, st->xcorr_mem2, conv1_tmp2, NB_XCORR_FEATURES, NB_XCORR_FEATURES, ACTIVATION_TANH); - compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH); - compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out); - compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR); + compute_generic_dense(&model->dense_downsampler, downsampler_out, downsampler_in, ACTIVATION_TANH, arch); + compute_generic_gru(&model->gru_1_input, &model->gru_1_recurrent, st->gru_state, downsampler_out, arch); + compute_generic_dense(&model->dense_final_upsampler, output, st->gru_state, ACTIVATION_LINEAR, arch); for (i=0;i<180;i++) { if (output[i] > maxval) { pos = i; @@ -65,7 +66,6 @@ void pitchdnn_init(PitchDNNState *st) ret = 0; #endif celt_assert(ret == 0); - /* FIXME: perform arch detection. */ } int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len) { diff --git a/dnn/pitchdnn.h b/dnn/pitchdnn.h index cdc4eb16..ed821412 100644 --- a/dnn/pitchdnn.h +++ b/dnn/pitchdnn.h @@ -27,7 +27,8 @@ int pitchdnn_load_model(PitchDNNState *st, const unsigned char *data, int len); float compute_pitchdnn( PitchDNNState *st, const float *if_features, - const float *xcorr_features + const float *xcorr_features, + int arch ); #endif diff --git a/dnn/vec_avx.h b/dnn/vec_avx.h index f0625158..73a55a22 100644 --- a/dnn/vec_avx.h +++ b/dnn/vec_avx.h @@ -655,11 +655,6 @@ static inline mm256i_emu opus_mm256_dpbusds_epi32(mm256i_emu src, mm256i_emu a, return res; } -#if defined(_MSC_VER) -#pragma message ("Only SSE and SSE2 are available. On newer machines, enable SSSE3/AVX/AVX2 to get better performance") -#else -#warning "Only SSE and SSE2 are available. 
On newer machines, enable SSSE3/AVX/AVX2 using -march= to get better performance" -#endif #else diff --git a/dnn/x86/dnn_x86.h b/dnn/x86/dnn_x86.h new file mode 100644 index 00000000..c0f2ffae --- /dev/null +++ b/dnn/x86/dnn_x86.h @@ -0,0 +1,78 @@ +/* Copyright (c) 2011-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+*/ + +#ifndef DNN_X86_H +#define DNN_X86_H + +#include "cpu_support.h" +#include "opus_types.h" + +#if defined(OPUS_X86_MAY_HAVE_SSE2) +void compute_linear_sse2(const LinearLayer *linear, float *out, const float *in); +#endif + +#if defined(OPUS_X86_MAY_HAVE_SSE4_1) +void compute_linear_sse4_1(const LinearLayer *linear, float *out, const float *in); +#endif + +#if defined(OPUS_X86_MAY_HAVE_AVX2) +void compute_linear_avx2(const LinearLayer *linear, float *out, const float *in); +#endif + + +#if defined(OPUS_X86_PRESUME_AVX2) + +#define OVERRIDE_COMPUTE_LINEAR +#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_avx2(linear, out, in)) + +#elif defined(OPUS_X86_PRESUME_SSE4_1) && !defined(OPUS_X86_MAY_HAVE_AVX2) + +#define OVERRIDE_COMPUTE_LINEAR +#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse4_1(linear, out, in)) + +#elif defined(OPUS_X86_PRESUME_SSE2) && !defined(OPUS_X86_MAY_HAVE_AVX2) && !defined(OPUS_X86_MAY_HAVE_SSE4_1) + +#define OVERRIDE_COMPUTE_LINEAR +#define compute_linear(linear, out, in, arch) ((void)(arch),compute_linear_sse2(linear, out, in)) + +#elif defined(OPUS_HAVE_RTCD) && (defined(OPUS_X86_MAY_HAVE_AVX2) || defined(OPUS_X86_MAY_HAVE_SSE4_1) || defined(OPUS_X86_MAY_HAVE_SSE2)) + +extern void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])( + const LinearLayer *linear, + float *out, + const float *in + ); + +#define OVERRIDE_COMPUTE_LINEAR +#define compute_linear(linear, out, in, arch) \ + ((*DNN_COMPUTE_LINEAR_IMPL[(arch) & OPUS_ARCHMASK])(linear, out, in)) + +#endif + + + +#endif /* DNN_X86_H */ diff --git a/dnn/x86/nnet_avx2.c b/dnn/x86/nnet_avx2.c new file mode 100644 index 00000000..f463b324 --- /dev/null +++ b/dnn/x86/nnet_avx2.c @@ -0,0 +1,38 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code 
must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef __AVX2__ +#error nnet_avx2.c is being compiled without AVX2 enabled +#endif + +#define RTCD_ARCH avx2 + +#include "nnet_arch.h" diff --git a/dnn/x86/nnet_sse2.c b/dnn/x86/nnet_sse2.c new file mode 100644 index 00000000..bcee5ccc --- /dev/null +++ b/dnn/x86/nnet_sse2.c @@ -0,0 +1,38 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. 
+ + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef __SSE2__ +#error nnet_sse2.c is being compiled without SSE2 enabled +#endif + +#define RTCD_ARCH sse2 + +#include "nnet_arch.h" diff --git a/dnn/x86/nnet_sse4_1.c b/dnn/x86/nnet_sse4_1.c new file mode 100644 index 00000000..4b530b65 --- /dev/null +++ b/dnn/x86/nnet_sse4_1.c @@ -0,0 +1,38 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#ifndef __SSE4_1__ +#error nnet_sse4_1.c is being compiled without SSE4.1 enabled +#endif + +#define RTCD_ARCH sse4_1 + +#include "nnet_arch.h" diff --git a/dnn/x86/x86_dnn_map.c b/dnn/x86/x86_dnn_map.c new file mode 100644 index 00000000..35e061ff --- /dev/null +++ b/dnn/x86/x86_dnn_map.c @@ -0,0 +1,54 @@ +/* Copyright (c) 2018-2019 Mozilla + 2023 Amazon */ +/* + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions + are met: + + - Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. + + THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE FOUNDATION OR + CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, + PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR + PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF + LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING + NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS + SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +*/ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "x86/x86cpu.h" +#include "nnet.h" + +#if defined(OPUS_HAVE_RTCD) + +#if (defined(OPUS_X86_MAY_HAVE_SSE2) && !defined(OPUS_X86_PRESUME_AVX2)) + +void (*const DNN_COMPUTE_LINEAR_IMPL[OPUS_ARCHMASK + 1])( + const LinearLayer *linear, + float *out, + const float *in +) = { + compute_linear_c, /* non-sse */ + compute_linear_c, + MAY_HAVE_SSE2(compute_linear), + MAY_HAVE_SSE4_1(compute_linear), /* sse4.1 */ + MAY_HAVE_AVX2(compute_linear) /* avx */ +}; + +#endif + + +#endif diff --git a/lpcnet_headers.mk b/lpcnet_headers.mk index be8cf301..d3aa1516 100644 --- a/lpcnet_headers.mk +++ b/lpcnet_headers.mk @@ -12,7 +12,9 @@ dnn/vec.h \ dnn/vec_avx.h \ dnn/vec_neon.h \ dnn/pitchdnn.h \ -dnn/pitchdnn_data.h +dnn/pitchdnn_data.h \ +dnn/x86/dnn_x86.h \ +dnn/nnet_arch.h DRED_HEAD = \ silk/dred_coding.h \ diff --git a/lpcnet_sources.mk b/lpcnet_sources.mk index 09b8b462..ee3d79fd 100644 --- a/lpcnet_sources.mk +++ b/lpcnet_sources.mk @@ -7,6 +7,7 @@ dnn/lpcnet_enc.c \ dnn/lpcnet_plc.c \ dnn/lpcnet_tables.c \ dnn/nnet.c \ +dnn/nnet_default.c \ dnn/plc_data.c \ dnn/parse_lpcnet_weights.c \ dnn/pitchdnn.c \ @@ -21,3 +22,8 @@ dnn/dred_rdovae_stats_data.c \ silk/dred_encoder.c \ silk/dred_coding.c \ silk/dred_decoder.c + +DNN_SOURCES_X86_RTCD = dnn/x86/x86_dnn_map.c +DNN_SOURCES_AVX2 = dnn/x86/nnet_avx2.c +DNN_SOURCES_SSE4_1 = dnn/x86/nnet_sse4_1.c +DNN_SOURCES_SSE2 = dnn/x86/nnet_sse2.c diff --git 
a/silk/dred_encoder.c b/silk/dred_encoder.c index b567a223..64ff2c7c 100644 --- a/silk/dred_encoder.c +++ b/silk/dred_encoder.c @@ -87,7 +87,7 @@ void dred_encoder_init(DREDEnc* enc, opus_int32 Fs, int channels) dred_encoder_reset(enc); } -static void dred_process_frame(DREDEnc *enc) +static void dred_process_frame(DREDEnc *enc, int arch) { float feature_buffer[2 * 36]; float input_buffer[2*DRED_NUM_FEATURES] = {0}; @@ -97,15 +97,15 @@ static void dred_process_frame(DREDEnc *enc) OPUS_MOVE(enc->latents_buffer + DRED_LATENT_DIM, enc->latents_buffer, (DRED_MAX_FRAMES - 1) * DRED_LATENT_DIM); /* calculate LPCNet features */ - lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer); - lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36); + lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer, feature_buffer, arch); + lpcnet_compute_single_frame_features_float(&enc->lpcnet_enc_state, enc->input_buffer + DRED_FRAME_SIZE, feature_buffer + 36, arch); /* prepare input buffer (discard LPC coefficients) */ OPUS_COPY(input_buffer, feature_buffer, DRED_NUM_FEATURES); OPUS_COPY(input_buffer + DRED_NUM_FEATURES, feature_buffer + 36, DRED_NUM_FEATURES); /* run RDOVAE encoder */ - dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer); + dred_rdovae_encode_dframe(&enc->rdovae_enc, &enc->model, enc->latents_buffer, enc->state_buffer, input_buffer, arch); enc->latents_buffer_fill = IMIN(enc->latents_buffer_fill+1, DRED_NUM_REDUNDANCY_FRAMES); } @@ -188,7 +188,7 @@ static void dred_convert_to_16k(DREDEnc *enc, const float *in, int in_len, float } } -void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay) +void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch) { int curr_offset16k; int frame_size16k = 
frame_size * 16000 / enc->Fs; @@ -206,7 +206,7 @@ void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int ex if (enc->input_buffer_fill >= 2*DRED_FRAME_SIZE) { curr_offset16k += 320; - dred_process_frame(enc); + dred_process_frame(enc, arch); enc->input_buffer_fill -= 2*DRED_FRAME_SIZE; OPUS_MOVE(&enc->input_buffer[0], &enc->input_buffer[2*DRED_FRAME_SIZE], enc->input_buffer_fill); /* 15 ms (6*2.5 ms) is the ideal offset for DRED because it corresponds to our vocoder look-ahead. */ diff --git a/silk/dred_encoder.h b/silk/dred_encoder.h index abeaac7f..d1d2376d 100644 --- a/silk/dred_encoder.h +++ b/silk/dred_encoder.h @@ -64,7 +64,7 @@ void dred_encoder_reset(DREDEnc* enc); void dred_deinit_encoder(DREDEnc *enc); -void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay); +void dred_compute_latents(DREDEnc *enc, const float *pcm, int frame_size, int extra_delay, int arch); int dred_encode_silk_frame(const DREDEnc *enc, unsigned char *buf, int max_chunks, int max_bytes); diff --git a/src/opus_decoder.c b/src/opus_decoder.c index 73be6f3b..1e0a1da4 100644 --- a/src/opus_decoder.c +++ b/src/opus_decoder.c @@ -1424,7 +1424,7 @@ int opus_dred_process(OpusDREDDecoder *dred_dec, const OpusDRED *src, OpusDRED * OPUS_COPY(dst, src, 1); if (dst->process_stage == 2) return OPUS_OK; - DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents); + DRED_rdovae_decode_all(&dred_dec->model, dst->fec_features, dst->state, dst->latents, dst->nb_latents, dred_dec->arch); dst->process_stage = 2; return OPUS_OK; #else diff --git a/src/opus_encoder.c b/src/opus_encoder.c index 27b3196a..28da18af 100644 --- a/src/opus_encoder.c +++ b/src/opus_encoder.c @@ -1715,7 +1715,7 @@ opus_int32 opus_encode_native(OpusEncoder *st, const opus_val16 *pcm, int frame_ #ifdef ENABLE_DRED if ( st->dred_duration > 0 && st->dred_encoder.loaded ) { /* DRED Encoder */ - dred_compute_latents( &st->dred_encoder, 
&pcm_buf[total_buffer*st->channels], frame_size, total_buffer ); + dred_compute_latents( &st->dred_encoder, &pcm_buf[total_buffer*st->channels], frame_size, total_buffer, st->arch ); } else { st->dred_encoder.latents_buffer_fill = 0; }